In [16]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, count
from pyspark.sql.types import IntegerType
from sedona.spark import *
import time

An error was encountered:
Invalid status code '404' from http://ec2-35-159-120-182.eu-central-1.compute.amazonaws.com:8998/sessions/1204 with error payload: {"msg":"Session '1204' not found."}


In [2]:
# Spark SQL code
spark = SparkSession \
    .builder \
    .appName("Query 3") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")
# Print schema
flattened_df.printSchema()


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- BG10: string (nullable = true)
 |-- BG10FIP10: string (nullable = true)
 |-- BG12: string (nullable = true)
 |-- CB10: string (nullable = true)
 |-- CEN_FIP13: string (nullable = true)
 |-- CITY: string (nullable = true)
 |-- CITYCOM: string (nullable = true)
 |-- COMM: string (nullable = true)
 |-- CT10: string (nullable = true)
 |-- CT12: string (nullable = true)
 |-- CTCB10: string (nullable = true)
 |-- HD_2012: long (nullable = true)
 |-- HD_NAME: string (nullable = true)
 |-- HOUSING10: long (nullable = true)
 |-- LA_FIP10: string (nullable = true)
 |-- OBJECTID: long (nullable = true)
 |-- POP_2010: long (nullable = true)
 |-- PUMA10: string (nullable = true)
 |-- SPA_2012: long (nullable = true)
 |-- SPA_NAME: string (nullable = true)
 |-- SUP_DIST: string (nullable = true)
 |-- SUP_LABEL: string (nullable = true)
 |-- ShapeSTArea: double (nullable = true)
 |-- ShapeSTLength: double (nullable = true)
 |-- ZCTA10: string (nullable = true)
 |-- geometry: geometry (nulla

In [3]:
LA_areas = flattened_df.groupBy("COMM").agg(ST_Union_Aggr("geometry").alias("geometry"))
LA_areas.printSchema()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

root
 |-- COMM: string (nullable = true)
 |-- geometry: geometry (nullable = true)

In [4]:
# income_schema = StructType([
#     StructField("Zip Code", IntegerType()),
#     StructField("Community", StringType()),
#     StructField("Estimated Median Income", FloatType()),
# ])

# income_df = spark.read.format('csv') \
#     .options(header='false') \
#     .schema(income_schema) \
#     .load("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv")
income_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True, inferSchema=True)

income_df = income_df.withColumn("Median Income", regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(IntegerType()))
income_df = income_df.withColumnRenamed("Zip Code", "ZIP Code")

crime_df_1 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True, inferSchema=True)
crime_df_2 = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv", header=True, inferSchema=True)
crime_df = crime_df_1.union(crime_df_2).select("LAT", "LON")
crime_df = crime_df.withColumn("geom", ST_Point("LON", "LAT"))
crime_df.show()

# Verify data in income_df
print("Income Data Sample:")
income_df.show(5)



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+---------+--------------------+
|    LAT|      LON|                geom|
+-------+---------+--------------------+
|33.9825|-118.2695|POINT (-118.2695 ...|
|33.9599|-118.3962|POINT (-118.3962 ...|
|34.0224|-118.2524|POINT (-118.2524 ...|
|34.1016|-118.3295|POINT (-118.3295 ...|
|34.0387|-118.2488|POINT (-118.2488 ...|
| 34.048|-118.2577|POINT (-118.2577 ...|
|34.0389|-118.2643|POINT (-118.2643 ...|
|34.0435|-118.2427|POINT (-118.2427 ...|
| 34.045| -118.264|POINT (-118.264 3...|
|34.0538|-118.2488|POINT (-118.2488 ...|
| 34.064|-118.2375|POINT (-118.2375 ...|
| 34.035|-118.2386|POINT (-118.2386 ...|
|34.0409|-118.2609|POINT (-118.2609 ...|
|34.0502| -118.254|POINT (-118.254 3...|
|34.0515|-118.2424|POINT (-118.2424 ...|
|34.0389| -118.255|POINT (-118.255 3...|
|34.0401|-118.2668|POINT (-118.2668 ...|
|34.0428|-118.2461|POINT (-118.2461 ...|
|34.0545|-118.2499|POINT (-118.2499 ...|
|34.0563|-118.2374|POINT (-118.2374 ...|
+-------+---------+--------------------+
only showing top

In [6]:
flattened_df.createOrReplaceTempView("census_blocks")
income_df.createOrReplaceTempView("income")
crime_df.createOrReplaceTempView("crime")
LA_areas.createOrReplaceTempView("LA_areas")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

In [7]:
inner_join_query = "SELECT * \
FROM census_blocks cb \
JOIN income i \
ON cb.ZCTA10 = i.`ZIP Code`;"

joined_data_income = spark.sql(inner_join_query)
joined_data_income.show(3)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-------+------------+-------+----+---------+-------------------+--------------------+-------------------+------+------+----------+-------+-------+---------+--------+--------+--------+------+--------+-----------+--------+----------+------------------+------------------+------+--------------------+--------+--------------------+-----------------------+-------------+
|   BG10|   BG10FIP10|   BG12|CB10|CEN_FIP13|               CITY|             CITYCOM|               COMM|  CT10|  CT12|    CTCB10|HD_2012|HD_NAME|HOUSING10|LA_FIP10|OBJECTID|POP_2010|PUMA10|SPA_2012|   SPA_NAME|SUP_DIST| SUP_LABEL|       ShapeSTArea|     ShapeSTLength|ZCTA10|            geometry|ZIP Code|           Community|Estimated Median Income|Median Income|
+-------+------------+-------+----+---------+-------------------+--------------------+-------------------+------+------+----------+-------+-------+---------+--------+--------+--------+------+--------+-----------+--------+----------+------------------+---------------

In [8]:
joined_data_income.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Physical Plan ==
AdaptiveSparkPlan (11)
+- BroadcastHashJoin Inner BuildRight (10)
   :- Project (5)
   :  +- Filter (4)
   :     +- Generate (3)
   :        +- Filter (2)
   :           +- Scan geojson  (1)
   +- BroadcastExchange (9)
      +- Project (8)
         +- Filter (7)
            +- Scan csv  (6)


(1) Scan geojson 
Output [1]: [features#25]
Batched: false
Location: InMemoryFileIndex [s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson]
PushedFilters: [IsNotNull(features)]
ReadSchema: struct<features:array<struct<geometry:binary,properties:struct<BG10:string,BG10FIP10:string,BG12:string,CB10:string,CEN_FIP13:string,CITY:string,CITYCOM:string,COMM:string,CT10:string,CT12:string,CTCB10:string,HD_2012:bigint,HD_NAME:string,HOUSING10:bigint,LA_FIP10:string,OBJECTID:bigint,POP_2010:bigint,PUMA10:string,SPA_2012:bigint,SPA_NAME:string,SUP_DIST:string,SUP_LABEL:string,ShapeSTArea:double,ShapeSTLength:double,ZCTA10:string>,type:string>>>

(2) Filter
In

In [9]:
# SHUFFLE_HASH join
shuffle_hash_join_query = """
    SELECT /*+ SHUFFLE_HASH(cb) */ *
    FROM census_blocks cb
    JOIN income i
    ON cb.ZCTA10 = i.`ZIP Code`
"""
start_time = time.time()
joined_data_income = spark.sql(shuffle_hash_join_query)
print(f"Shuffle Hash Join Time: {time.time() - start_time:.2f} seconds\n")
joined_data_income.show(3)
print("---------------------------------------------------------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------------------------------------------------------")
joined_data_income.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Shuffle Hash Join Time: 0.02 seconds

+-------+------------+-------+----+---------+--------------+--------------------+--------------+------+------+----------+-------+------------------+---------+--------+--------+--------+------+--------+--------+--------+----------+-----------------+------------------+------+--------------------+--------+--------------------+-----------------------+-------------+
|   BG10|   BG10FIP10|   BG12|CB10|CEN_FIP13|          CITY|             CITYCOM|          COMM|  CT10|  CT12|    CTCB10|HD_2012|           HD_NAME|HOUSING10|LA_FIP10|OBJECTID|POP_2010|PUMA10|SPA_2012|SPA_NAME|SUP_DIST| SUP_LABEL|      ShapeSTArea|     ShapeSTLength|ZCTA10|            geometry|ZIP Code|           Community|Estimated Median Income|Median Income|
+-------+------------+-------+----+---------+--------------+--------------------+--------------+------+------+----------+-------+------------------+---------+--------+--------+--------+------+--------+--------+--------+----------+----

In [10]:
# BROADCAST join
broadcast_join_query = """
    SELECT /*+ BROADCAST(cb) */ *
    FROM census_blocks cb
    JOIN income i
    ON cb.ZCTA10 = i.`ZIP Code`
"""
start_time = time.time()
joined_data_income = spark.sql(broadcast_join_query)
print(f"Broadcast Join Time: {time.time() - start_time:.2f} seconds\n")
joined_data_income.show(3)
print("---------------------------------------------------------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------------------------------------------------------")
joined_data_income.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Broadcast Join Time: 0.02 seconds

+-------+------------+-------+----+---------+--------------+--------------------+------------------+------+------+----------+-------+-------+---------+--------+--------+--------+------+--------+--------+--------+----------+------------------+------------------+------+--------------------+--------+--------------------+-----------------------+-------------+
|   BG10|   BG10FIP10|   BG12|CB10|CEN_FIP13|          CITY|             CITYCOM|              COMM|  CT10|  CT12|    CTCB10|HD_2012|HD_NAME|HOUSING10|LA_FIP10|OBJECTID|POP_2010|PUMA10|SPA_2012|SPA_NAME|SUP_DIST| SUP_LABEL|       ShapeSTArea|     ShapeSTLength|ZCTA10|            geometry|ZIP Code|           Community|Estimated Median Income|Median Income|
+-------+------------+-------+----+---------+--------------+--------------------+------------------+------+------+----------+-------+-------+---------+--------+--------+--------+------+--------+--------+--------+----------+------------------+-------

In [11]:
# MERGE join
merge_join_query = """
    SELECT /*+ MERGE(cb, i) */ *
    FROM census_blocks cb
    JOIN income i
    ON cb.ZCTA10 = i.`ZIP Code`
"""
start_time = time.time()
joined_data_income = spark.sql(merge_join_query)
print(f"Merge Join Time: {time.time() - start_time:.2f} seconds\n")
joined_data_income.show(3)
print("---------------------------------------------------------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------------------------------------------------------")
joined_data_income.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Merge Join Time: 0.03 seconds

+-------+------------+-------+----+---------+--------------+--------------------+--------------+------+------+----------+-------+------------------+---------+--------+--------+--------+------+--------+--------+--------+----------+------------------+------------------+------+--------------------+--------+--------------------+-----------------------+-------------+
|   BG10|   BG10FIP10|   BG12|CB10|CEN_FIP13|          CITY|             CITYCOM|          COMM|  CT10|  CT12|    CTCB10|HD_2012|           HD_NAME|HOUSING10|LA_FIP10|OBJECTID|POP_2010|PUMA10|SPA_2012|SPA_NAME|SUP_DIST| SUP_LABEL|       ShapeSTArea|     ShapeSTLength|ZCTA10|            geometry|ZIP Code|           Community|Estimated Median Income|Median Income|
+-------+------------+-------+----+---------+--------------+--------------------+--------------+------+------+----------+-------+------------------+---------+--------+--------+--------+------+--------+--------+--------+----------+---------

In [12]:
# SHUFFLE_REPLICATE_NL join
shuffle_replicate_nl_join_query = """
    SELECT /*+ SHUFFLE_REPLICATE_NL(cb) */ *
    FROM census_blocks cb
    JOIN income i
    ON cb.ZCTA10 = i.`ZIP Code`
"""
start_time = time.time()
joined_data_income = spark.sql(shuffle_replicate_nl_join_query)
print(f"Shuffle Replicate NL Join Time: {time.time() - start_time:.2f} seconds\n")
joined_data_income.show(3)
print("---------------------------------------------------------------------------------------------------------------------------------")
print("---------------------------------------------------------------------------------------------------------------------------------")
joined_data_income.explain(mode="formatted")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Shuffle Replicate NL Join Time: 0.02 seconds

+-------+------------+-------+----+---------+-------------------+--------------------+-------------------+------+------+----------+-------+-------+---------+--------+--------+--------+------+--------+-----------+--------+----------+------------------+------------------+------+--------------------+--------+--------------------+-----------------------+-------------+
|   BG10|   BG10FIP10|   BG12|CB10|CEN_FIP13|               CITY|             CITYCOM|               COMM|  CT10|  CT12|    CTCB10|HD_2012|HD_NAME|HOUSING10|LA_FIP10|OBJECTID|POP_2010|PUMA10|SPA_2012|   SPA_NAME|SUP_DIST| SUP_LABEL|       ShapeSTArea|     ShapeSTLength|ZCTA10|            geometry|ZIP Code|           Community|Estimated Median Income|Median Income|
+-------+------------+-------+----+---------+-------------------+--------------------+-------------------+------+------+----------+-------+-------+---------+--------+--------+--------+------+--------+-----------+--------

In [13]:
joined_data_income.createOrReplaceTempView("joined_data_income")
group_query = '''SELECT \
    COMM, \
    SUM(POP_2010) AS Total_POP_2010, \
    SUM(HOUSING10) AS Total_HOUSING10, \
    AVG(`Median Income`) AS Avg_Median_Income \
FROM  \
    joined_data_income \
GROUP BY  \
    COMM;'''
grouped_data_income = spark.sql(group_query)
grouped_data_income.show()

# group_query = '''
# SELECT 
#     COMM, 
#     SUM(POP_2010) AS Total_POP_2010, 
#     SUM(HOUSING10) AS Total_HOUSING10, 
#     AVG(`Median Income`) AS Avg_Median_Income,
#     (AVG(`Median Income`) * SUM(HOUSING10)) / SUM(POP_2010) AS Avg_Annual_Income_Per_Person,
#     ST_Union_Aggr(geometry) AS aggregated_geometry
# FROM  
#     joined_data
# GROUP BY  
#     COMM
# '''
# grouped_data_income = spark.sql(group_query)
# grouped_data_income.show(10)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+--------------+---------------+------------------+
|                COMM|Total_POP_2010|Total_HOUSING10| Avg_Median_Income|
+--------------------+--------------+---------------+------------------+
|         Culver City|         38883|          17491| 74486.27088948787|
|     North Lancaster|          1101|            559| 47855.70053475936|
|Rosewood/East Gar...|          1164|            354|           53306.0|
|East Rancho Domin...|         15135|           3186|           41960.0|
|      Toluca Terrace|          1301|            541|           48499.0|
|        Elysian Park|          5267|           1993|  35151.9801980198|
|            Longwood|          4210|           1474|           38330.0|
|         Pico Rivera|         62942|          17109|56332.066584463624|
|              Malibu|         12645|           6864|123604.49367088608|
|       Green Meadows|         19821|           5204|30573.460674157304|
|    Hacienda Heights|         53594|          1652

In [8]:
join_query = '''SELECT crime.*, LA_areas.* \
FROM crime \
INNER JOIN LA_areas \
ON ST_Within(crime.geom, LA_areas.geometry);'''

joined_data_crime= spark.sql(join_query)
joined_data_crime.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----+--------+--------------------+-----------------+--------------------+
|  LAT|     LON|                geom|             COMM|            geometry|
+-----+--------+--------------------+-----------------+--------------------+
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|
|33.96|-118.309|POINT (-118.309 3...|Manchester Square|POLYGON ((-118.31...|

In [9]:
grouped_data_crime = joined_data_crime.groupBy("COMM").agg(count("geom").alias("crime_count"))
grouped_data_crime.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+-----------+
|                COMM|crime_count|
+--------------------+-----------+
|      Gramercy Place|      11148|
|     Athens-Westmont|        665|
|Figueroa Park Square|       8180|
|      Vermont Knolls|      20367|
|       Vermont Vista|      50165|
|              Venice|      40942|
|        Santa Monica|        795|
|      Marina del Rey|        702|
|    Marina Peninsula|       2656|
|         Culver City|       1390|
|       Baldwin Hills|      34194|
|View Park/Windsor...|       1306|
|   Crenshaw District|      13297|
|       Beverly Crest|       4527|
|         Studio City|      19245|
|     Franklin Canyon|          2|
|      Valley Village|      15317|
|        Sherman Oaks|      64370|
|            Longwood|       3074|
|    Lafayette Square|       3982|
+--------------------+-----------+
only showing top 20 rows

In [14]:
grouped_data_crime.createOrReplaceTempView("grouped_data_crime")
grouped_data_income.createOrReplaceTempView("grouped_data_income")

# Group by COMM and count the geom column
final_join = '''
SELECT  \
    gd_income.COMM,  \
    gd_income.Avg_Median_Income,  \
    (gd_crime.crime_count / gd_income.Total_POP_2010) AS crimes_per_person \
FROM  \
    grouped_data_income gd_income \
JOIN  \
    grouped_data_crime gd_crime \
ON  \
    gd_income.COMM = gd_crime.COMM;
'''
final_data= spark.sql(final_join)
final_data.show()



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+---------------+------------------+--------------------+
|           COMM| Avg_Median_Income|   crimes_per_person|
+---------------+------------------+--------------------+
|       Van Nuys|43827.914666666664|  0.9170415838361292|
|    North Hills| 56409.98540145985|   0.655059866962306|
|     Northridge|60909.710144927536|  0.8133372019923594|
|         Encino|  88326.6334231806|   0.703747432052705|
|North Hollywood| 46568.82010582011|  0.8013964204433706|
|    Canoga Park| 52561.75073313783|  0.8960521181480413|
|         Reseda|  51637.0938697318|  0.6108411423161002|
|  Panorama City| 37423.00787401575|  0.6498237536867851|
|    Lake Balboa| 49953.90217391304|  0.6017674035141959|
|       Winnetka| 62067.63815789474|  0.6688561626642122|
|   Reseda Ranch|           51743.6|  0.7724644128113879|
|      Westhills|          100075.0| 0.01278772378516624|
|     West Hills| 80861.76146788991|  0.4833172462369238|
|        Burbank| 67848.98430899215|0.009763886200890266|
|        Tarza