In [3]:
# Query 1 RDD

import csv,time
from io import StringIO
from pyspark.sql import SparkSession
from pyspark.sql import Row

sc = SparkSession \
    .builder \
    .appName("Query 1 RDD") \
    .config("spark.executor.instances", "4") \
    .getOrCreate() \
    .sparkContext

def parse_csv_line(line):
    f = StringIO(line)
    reader = csv.reader(f)
    return next(reader) 

def help1(data):
    try:
        age=int(data)
        if age<18 and age>0:
            return "child"
        if age<25:
            return "young adult"
        if age<65 :
            return "adult"
        if age>64:
            return "old"
        else:
            return "no individual victim"
    except:
        return "error"
    

start_time = time.time()
    
rdd1  = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")\
.map(parse_csv_line)

rdd2= sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")\
.map(parse_csv_line)

header1 = rdd1.first()
header2 = rdd2.first()

# Filter out the header
rdd1_data = rdd1.filter(lambda line: line != header1)
rdd2_data = rdd2.filter(lambda line: line != header2)

crime_data = rdd1_data.union(rdd2_data) \
.filter(lambda pair: pair[9].find("AGGRAVATED") != -1 ) \
.map(lambda data: (help1(data[11]), 1)) \
.reduceByKey(lambda a, b: a + b) \
.sortBy( lambda pair : pair[1], ascending=False )

print(crime_data.collect())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('adult', 121093), ('young adult', 38703), ('child', 10830), ('old', 5985)]
Time taken: 22.19 seconds

In [2]:
####query 1 dataframe

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col
from pyspark.sql import functions as F
import time


spark = SparkSession \
    .builder \
    .appName("Query 1 Dataframe") \
    .config("spark.executor.instances", "4") \
    .getOrCreate() 


def age_group(age_str):
    try:
        age=int(age_str)
        if age<18 and age>0:
            return "child"
        if age<25:
            return "young adult"
        if age<65 :
            return "adult"
        if age>64:
            return "old"
        else:
            return "no individual victim"
    except:
        return "error"


start_time=time.time()
dataframe1= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",header=True)

age_udf=udf(age_group,StringType())
dataframe=dataframe1.union(dataframe2)\
.filter(col("Crm Cd Desc").contains("AGGRAVATED"))\
.withColumn("age_group",age_udf(col("Vict Age")))\
.groupBy("age_group").agg(F.count("*").alias("count"))\
.orderBy("count",ascending=False)


dataframe.show()


end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+------+
|  age_group| count|
+-----------+------+
|      adult|121093|
|young adult| 38703|
|      child| 10830|
|        old|  5985|
+-----------+------+

Time taken: 11.91 seconds

In [11]:
####query2a dataframe

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col,count,when
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import time


spark = SparkSession \
    .builder \
    .appName("Query 2 Dataframe") \
    .getOrCreate() 


window_spec = Window.partitionBy("Year").orderBy(F.desc("closed_case_rate"))


start_time=time.time()


dataframe1= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",header=True)



dataframe=dataframe1.union(dataframe2)\
.withColumn("year",col("Date Rptd").substr(7,4))\
.select("year","AREA NAME","Status")\
.groupBy("year","AREA NAME").agg((count(when(col("Status") != "IC", 1)) / count("*") ).alias("closed_case_rate"))\
.withColumn("#", F.row_number().over(window_spec) )\
.filter(col("#") <= 3)\
.withColumnRenamed("AREA NAME", "precinct")

dataframe.show(24)

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")


    




FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-------------------+---+
|year|   precinct|   closed_case_rate|  #|
+----+-----------+-------------------+---+
|2010|    Rampart|0.32947355855318133|  1|
|2010|    Olympic|0.31962706191728424|  2|
|2010|     Harbor| 0.2963203463203463|  3|
|2011|    Olympic|0.35212167689161555|  1|
|2011|    Rampart|0.32511779630300836|  2|
|2011|     Harbor| 0.2865220520201501|  3|
|2012|    Olympic| 0.3441481831052383|  1|
|2012|    Rampart|  0.329464181029429|  2|
|2012|     Harbor| 0.2981513327601032|  3|
|2013|    Olympic| 0.3352812271731191|  1|
|2013|    Rampart| 0.3208287360549221|  2|
|2013|     Harbor| 0.2916422459266206|  3|
|2014|   Van Nuys| 0.3180567315834039|  1|
|2014|West Valley| 0.3131198995605775|  2|
|2014|    Mission| 0.3116279069767442|  3|
|2015|   Van Nuys| 0.3264134698172773|  1|
|2015|West Valley| 0.3027597402597403|  2|
|2015|    Mission|0.30179460678380154|  3|
|2016|   Van Nuys|0.31880755720117726|  1|
|2016|West Valley| 0.3154798761609907|  2|
|2016|   Fo

In [12]:
###query2a spark sql api


from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col,count,when
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import time


spark = SparkSession \
    .builder \
    .appName("Query 2 SQL API") \
    .getOrCreate()


start_time=time.time()

dataframe1= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",header=True)

dataframe=dataframe1.union(dataframe2)

dataframe.createOrReplaceTempView("Dataset")
query= """
    WITH extracted_data AS (
        SELECT 
            substr(`Date Rptd`, 7, 4) AS year,
            `AREA NAME` AS precinct,
            Status
        FROM Dataset
    ),
    aggregated_data AS (
        SELECT
            year,
            precinct,
            COUNT(CASE WHEN Status != 'IC' THEN 1 END)  / COUNT(*) AS closed_case_rate
        FROM extracted_data
        GROUP BY year, precinct
    ),
    ranked_data AS (
        SELECT
            year,
            precinct,
            closed_case_rate,
            ROW_NUMBER() OVER (PARTITION BY year ORDER BY closed_case_rate DESC) AS `#`
        FROM aggregated_data
    )
    SELECT 
        year,
        precinct,
        closed_case_rate,
        `#`
    FROM ranked_data
    WHERE `#` <= 3
"""
res=spark.sql(query)
res.show(24)

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-------------------+---+
|year|   precinct|   closed_case_rate|  #|
+----+-----------+-------------------+---+
|2010|    Rampart|0.32947355855318133|  1|
|2010|    Olympic|0.31962706191728424|  2|
|2010|     Harbor| 0.2963203463203463|  3|
|2011|    Olympic|0.35212167689161555|  1|
|2011|    Rampart|0.32511779630300836|  2|
|2011|     Harbor| 0.2865220520201501|  3|
|2012|    Olympic| 0.3441481831052383|  1|
|2012|    Rampart|  0.329464181029429|  2|
|2012|     Harbor| 0.2981513327601032|  3|
|2013|    Olympic| 0.3352812271731191|  1|
|2013|    Rampart| 0.3208287360549221|  2|
|2013|     Harbor| 0.2916422459266206|  3|
|2014|   Van Nuys| 0.3180567315834039|  1|
|2014|West Valley| 0.3131198995605775|  2|
|2014|    Mission| 0.3116279069767442|  3|
|2015|   Van Nuys| 0.3264134698172773|  1|
|2015|West Valley| 0.3027597402597403|  2|
|2015|    Mission|0.30179460678380154|  3|
|2016|   Van Nuys|0.31880755720117726|  1|
|2016|West Valley| 0.3154798761609907|  2|
|2016|   Fo

In [None]:
####2b
####make parquet dataset


from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col,count,when
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import time


spark = SparkSession \
    .builder \
    .appName("Query 2b write parquet") \
    .getOrCreate()

dataframe1= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",header=True)
dataframe=dataframe1.union(dataframe2)

dataframe.coalesce(1).write.mode("overwrite").parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") ##coalesce gia 1 file






In [20]:
####2b test parquet file on 2a query dataframe

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col,count,when
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import time


spark = SparkSession \
    .builder \
    .appName("Query 2b test parquet for 2b Dataframe") \
    .getOrCreate()

start_time=time.time()

dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet")

dataframe=dataframe.withColumn("year",col("Date Rptd").substr(7,4))\
.select("year","AREA NAME","Status")\
.groupBy("year","AREA NAME").agg((count(when(col("Status") != "IC", 1)) / count("*") ).alias("closed_case_rate"))\
.withColumn("#", F.row_number().over(window_spec) )\
.filter(col("#") <= 3)\
.withColumnRenamed("AREA NAME", "precinct")

dataframe.show(24)

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----+-----------+-------------------+---+
|year|   precinct|   closed_case_rate|  #|
+----+-----------+-------------------+---+
|2010|    Rampart|0.32947355855318133|  1|
|2010|    Olympic|0.31962706191728424|  2|
|2010|     Harbor| 0.2963203463203463|  3|
|2011|    Olympic|0.35212167689161555|  1|
|2011|    Rampart|0.32511779630300836|  2|
|2011|     Harbor| 0.2865220520201501|  3|
|2012|    Olympic| 0.3441481831052383|  1|
|2012|    Rampart|  0.329464181029429|  2|
|2012|     Harbor| 0.2981513327601032|  3|
|2013|    Olympic| 0.3352812271731191|  1|
|2013|    Rampart| 0.3208287360549221|  2|
|2013|     Harbor| 0.2916422459266206|  3|
|2014|   Van Nuys| 0.3180567315834039|  1|
|2014|West Valley| 0.3131198995605775|  2|
|2014|    Mission| 0.3116279069767442|  3|
|2015|   Van Nuys| 0.3264134698172773|  1|
|2015|West Valley| 0.3027597402597403|  2|
|2015|    Mission|0.30179460678380154|  3|
|2016|   Van Nuys|0.31880755720117726|  1|
|2016|West Valley| 0.3154798761609907|  2|
|2016|   Fo

In [35]:
####query 3 NO HINT!!!!!

from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

start_time=time.time()
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
blocks_census = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type") \
            .dropna(subset=["COMM","ZCTA10"])  \


census = blocks_census \
    .select("COMM", "POP_2010", "ZCTA10", "HOUSING10") \
    .na.fill({"POP_2010": 0, "HOUSING10": 0}) \
    .groupBy("COMM", "ZCTA10") \
    .agg(
        F.sum("POP_2010").alias("TOTAL_POP_2010"),
        F.sum("HOUSING10").alias("TOTAL_HOUSING10")
    )

income= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True)

res1= income.withColumn( "Estimated Median Income", F.regexp_replace(col("Estimated Median Income"), "[$,.]", "").cast("float"))\
.join(census,census["ZCTA10"]==income["Zip Code"])\
.withColumn("total_income",col("Estimated Median Income")*col("TOTAL_HOUSING10") )\
.groupBy("COMM").agg(
                    F.sum("TOTAL_POP_2010").alias("total_population"),
                    F.sum("total_income").alias("comm_total_income"))\
.withColumn("Average Income", col("comm_total_income")/col("total_population"))


crime_dataset = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet")\
.filter((col("LON") != 0) | (col("LAT") != 0))  \
.withColumn("point",ST_Point("LON", "LAT")) \
.select("point")


# res2 = crime_dataset \
#     .join(blocks_census, ST_Within(crime_dataset.point, blocks_census.geometry),)\
# .select("COMM","POP_2010")\
# .groupBy("COMM")\
# .agg(F.sum("POP_2010").alias("TotalPopulation"),F.count("*").alias("NumberOfCrimes"))\
# .withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res2 = crime_dataset \
    .join(blocks_census.select("geometry","COMM","POP_2010"), ST_Within(crime_dataset.point, blocks_census.geometry)) \
.groupBy("geometry","COMM","POP_2010") \
.agg(F.count("*").alias("NumberOfCrimesPerBlock"))\
.groupby("COMM") \
.agg(F.sum("POP_2010").alias("TotalPopulation"),F.sum("NumberOfCrimesPerBlock").alias("NumberOfCrimes") ) \
.withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res=res1.join(res2,res1["COMM"]==res2["COMM"]).select(res1["COMM"].alias("COMM"),"Average Income","Crimes per Person").orderBy(col("Average Income").desc())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



res.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 4.02 seconds
+--------------------+------------------+--------------------+
|                COMM|    Average Income|   Crimes per Person|
+--------------------+------------------+--------------------+
|      Marina del Rey| 76428.84908639747| 0.16078790655061842|
|   Pacific Palisades| 70656.11180545464|  0.4720770986558458|
|              Malibu|  67135.0118623962|0.003460207612456...|
| Palisades Highlands| 66867.44038612054|  0.2055830941821028|
|    Marina Peninsula|65235.692875259396|  0.6549938347718866|
|             Bel Air| 63041.33942621959| 0.43199608610567514|
|Palos Verdes Estates| 61905.61214466438|0.008547008547008548|
|     Manhattan Beach|60985.189241497086|0.033648790746582544|
|       Beverly Crest| 60947.48978754819| 0.37490683229813665|
|           Brentwood| 60840.62462032012|  0.5346764258279586|
|       Hermosa Beach| 57924.85594176151|0.016216216216216217|
|   Mandeville Canyon| 55572.11011444479|  0.2716207559256887|
|La Cañada Flintridge| 54900.6

In [2]:
res.explain(True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Parsed Logical Plan ==
'Sort ['Average Income DESC NULLS LAST], true
+- Project [COMM#49 AS COMM#465, Average Income#289, Crimes per Person#414]
   +- Join Inner, (COMM#49 = COMM#430)
      :- Project [COMM#49, total_population#283L, comm_total_income#285, (comm_total_income#285 / cast(total_population#283L as double)) AS Average Income#289]
      :  +- Aggregate [COMM#49], [COMM#49, sum(TOTAL_POP_2010#215L) AS total_population#283L, sum(total_income#265) AS comm_total_income#285]
      :     +- Project [Zip Code#240, Community#241, Estimated Median Income#246, COMM#49, ZCTA10#66, TOTAL_POP_2010#215L, TOTAL_HOUSING10#217L, (Estimated Median Income#246 * cast(TOTAL_HOUSING10#217L as float)) AS total_income#265]
      :        +- Join Inner, (ZCTA10#66 = Zip Code#240)
      :           :- Project [Zip Code#240, Community#241, cast(regexp_replace(Estimated Median Income#242, [$,.], , 1) as float) AS Estimated Median Income#246]
      :           :  +- Relation [Zip Code#240,Community#2

In [37]:
####query 3 HINT SHUFFLE HASH !!!!

from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

start_time=time.time()
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
blocks_census = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type") \
            .dropna(subset=["COMM","ZCTA10"])  \


census = blocks_census \
    .select("COMM", "POP_2010", "ZCTA10", "HOUSING10") \
    .na.fill({"POP_2010": 0, "HOUSING10": 0}) \
    .groupBy("COMM", "ZCTA10") \
    .agg(
        F.sum("POP_2010").alias("TOTAL_POP_2010"),
        F.sum("HOUSING10").alias("TOTAL_HOUSING10")
    )

income= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True)

res1= income.withColumn( "Estimated Median Income", F.regexp_replace(col("Estimated Median Income"), "[$,.]", "").cast("float"))\
.join(census,census["ZCTA10"]==income["Zip Code"])\
.withColumn("total_income",col("Estimated Median Income")*col("TOTAL_HOUSING10") )\
.groupBy("COMM").agg(
                    F.sum("TOTAL_POP_2010").alias("total_population"),
                    F.sum("total_income").alias("comm_total_income"))\
.withColumn("Average Income", col("comm_total_income")/col("total_population"))


crime_dataset = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet")\
.filter((col("LON") != 0) | (col("LAT") != 0))  \
.withColumn("point",ST_Point("LON", "LAT")) \
.select("point")


# res2 = crime_dataset \
#     .join(blocks_census, ST_Within(crime_dataset.point, blocks_census.geometry),)\
# .select("COMM","POP_2010")\
# .groupBy("COMM")\
# .agg(F.sum("POP_2010").alias("TotalPopulation"),F.count("*").alias("NumberOfCrimes"))\
# .withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res2 = crime_dataset \
    .join(blocks_census.select("geometry","COMM","POP_2010"), ST_Within(crime_dataset.point, blocks_census.geometry)) \
.groupBy("geometry","COMM","POP_2010") \
.agg(F.count("*").alias("NumberOfCrimesPerBlock"))\
.groupby("COMM") \
.agg(F.sum("POP_2010").alias("TotalPopulation"),F.sum("NumberOfCrimesPerBlock").alias("NumberOfCrimes") ) \
.withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res=res1.join(res2.hint("SHUFFLE_HASH"),res1["COMM"]==res2["COMM"]).select(res1["COMM"].alias("COMM"),"Average Income","Crimes per Person").orderBy(col("Average Income").desc())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



res.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 2.77 seconds
+--------------------+------------------+--------------------+
|                COMM|    Average Income|   Crimes per Person|
+--------------------+------------------+--------------------+
|      Marina del Rey| 76428.84908639747| 0.16078790655061842|
|   Pacific Palisades| 70656.11180545464|  0.4720770986558458|
|              Malibu|  67135.0118623962|0.003460207612456...|
| Palisades Highlands| 66867.44038612054|  0.2055830941821028|
|    Marina Peninsula|65235.692875259396|  0.6549938347718866|
|             Bel Air| 63041.33942621959| 0.43199608610567514|
|Palos Verdes Estates| 61905.61214466438|0.008547008547008548|
|     Manhattan Beach|60985.189241497086|0.033648790746582544|
|       Beverly Crest| 60947.48978754819| 0.37490683229813665|
|           Brentwood| 60840.62462032012|  0.5346764258279586|
|       Hermosa Beach| 57924.85594176151|0.016216216216216217|
|   Mandeville Canyon| 55572.11011444479|  0.2716207559256887|
|La Cañada Flintridge| 54900.6

In [24]:
res.explain(True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Parsed Logical Plan ==
'Sort ['Average Income DESC NULLS LAST], true
+- Project [COMM#6843 AS COMM#7259, Average Income#7083, Crimes per Person#7208]
   +- Join Inner, (COMM#6843 = COMM#7224)
      :- Project [COMM#6843, total_population#7077L, comm_total_income#7079, (comm_total_income#7079 / cast(total_population#7077L as double)) AS Average Income#7083]
      :  +- Aggregate [COMM#6843], [COMM#6843, sum(TOTAL_POP_2010#7009L) AS total_population#7077L, sum(total_income#7059) AS comm_total_income#7079]
      :     +- Project [Zip Code#7034, Community#7035, Estimated Median Income#7040, COMM#6843, ZCTA10#6860, TOTAL_POP_2010#7009L, TOTAL_HOUSING10#7011L, (Estimated Median Income#7040 * cast(TOTAL_HOUSING10#7011L as float)) AS total_income#7059]
      :        +- Join Inner, (ZCTA10#6860 = Zip Code#7034)
      :           :- Project [Zip Code#7034, Community#7035, cast(regexp_replace(Estimated Median Income#7036, [$,.], , 1) as float) AS Estimated Median Income#7040]
      :         

In [42]:
####query 3 HINT BROADCAST

from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

start_time=time.time()
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
blocks_census = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type") \
            .dropna(subset=["COMM","ZCTA10"])  \


census = blocks_census \
    .select("COMM", "POP_2010", "ZCTA10", "HOUSING10") \
    .na.fill({"POP_2010": 0, "HOUSING10": 0}) \
    .groupBy("COMM", "ZCTA10") \
    .agg(
        F.sum("POP_2010").alias("TOTAL_POP_2010"),
        F.sum("HOUSING10").alias("TOTAL_HOUSING10")
    )

income= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True)

res1= income.withColumn( "Estimated Median Income", F.regexp_replace(col("Estimated Median Income"), "[$,.]", "").cast("float"))\
.join(census,census["ZCTA10"]==income["Zip Code"])\
.withColumn("total_income",col("Estimated Median Income")*col("TOTAL_HOUSING10") )\
.groupBy("COMM").agg(
                    F.sum("TOTAL_POP_2010").alias("total_population"),
                    F.sum("total_income").alias("comm_total_income"))\
.withColumn("Average Income", col("comm_total_income")/col("total_population"))


crime_dataset = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet")\
.filter((col("LON") != 0) | (col("LAT") != 0))  \
.withColumn("point",ST_Point("LON", "LAT")) \
.select("point")


# res2 = crime_dataset \
#     .join(blocks_census, ST_Within(crime_dataset.point, blocks_census.geometry),)\
# .select("COMM","POP_2010")\
# .groupBy("COMM")\
# .agg(F.sum("POP_2010").alias("TotalPopulation"),F.count("*").alias("NumberOfCrimes"))\
# .withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res2 = crime_dataset \
    .join(blocks_census.select("geometry","COMM","POP_2010"), ST_Within(crime_dataset.point, blocks_census.geometry)) \
.groupBy("geometry","COMM","POP_2010") \
.agg(F.count("*").alias("NumberOfCrimesPerBlock"))\
.groupby("COMM") \
.agg(F.sum("POP_2010").alias("TotalPopulation"),F.sum("NumberOfCrimesPerBlock").alias("NumberOfCrimes") ) \
.withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res=res1.join(res2.hint("BROADCAST"),res1["COMM"]==res2["COMM"]).select(res1["COMM"].alias("COMM"),"Average Income","Crimes per Person").orderBy(col("Average Income").desc())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



res.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 2.73 seconds
+--------------------+------------------+--------------------+
|                COMM|    Average Income|   Crimes per Person|
+--------------------+------------------+--------------------+
|      Marina del Rey| 76428.84908639747| 0.16078790655061842|
|   Pacific Palisades| 70656.11180545464|  0.4720770986558458|
|              Malibu|  67135.0118623962|0.003460207612456...|
| Palisades Highlands| 66867.44038612054|  0.2055830941821028|
|    Marina Peninsula|65235.692875259396|  0.6549938347718866|
|             Bel Air| 63041.33942621959| 0.43199608610567514|
|Palos Verdes Estates| 61905.61214466438|0.008547008547008548|
|     Manhattan Beach|60985.189241497086|0.033648790746582544|
|       Beverly Crest| 60947.48978754819| 0.37490683229813665|
|           Brentwood| 60840.62462032012|  0.5346764258279586|
|       Hermosa Beach| 57924.85594176151|0.016216216216216217|
|   Mandeville Canyon| 55572.11011444479|  0.2716207559256887|
|La Cañada Flintridge| 54900.6

In [18]:
res.explain(True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Parsed Logical Plan ==
'Sort ['Average Income DESC NULLS LAST], true
+- Project [COMM#5133 AS COMM#5549, Average Income#5373, Crimes per Person#5498]
   +- Join Inner, (COMM#5133 = COMM#5514)
      :- Project [COMM#5133, total_population#5367L, comm_total_income#5369, (comm_total_income#5369 / cast(total_population#5367L as double)) AS Average Income#5373]
      :  +- Aggregate [COMM#5133], [COMM#5133, sum(TOTAL_POP_2010#5299L) AS total_population#5367L, sum(total_income#5349) AS comm_total_income#5369]
      :     +- Project [Zip Code#5324, Community#5325, Estimated Median Income#5330, COMM#5133, ZCTA10#5150, TOTAL_POP_2010#5299L, TOTAL_HOUSING10#5301L, (Estimated Median Income#5330 * cast(TOTAL_HOUSING10#5301L as float)) AS total_income#5349]
      :        +- Join Inner, (ZCTA10#5150 = Zip Code#5324)
      :           :- Project [Zip Code#5324, Community#5325, cast(regexp_replace(Estimated Median Income#5326, [$,.], , 1) as float) AS Estimated Median Income#5330]
      :         

In [31]:
####query 3 HINT SHUFFLE REPLICATE NL

from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

start_time=time.time()
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
blocks_census = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type") \
            .dropna(subset=["COMM","ZCTA10"])  \


census = blocks_census \
    .select("COMM", "POP_2010", "ZCTA10", "HOUSING10") \
    .na.fill({"POP_2010": 0, "HOUSING10": 0}) \
    .groupBy("COMM", "ZCTA10") \
    .agg(
        F.sum("POP_2010").alias("TOTAL_POP_2010"),
        F.sum("HOUSING10").alias("TOTAL_HOUSING10")
    )

income= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True)

res1= income.withColumn( "Estimated Median Income", F.regexp_replace(col("Estimated Median Income"), "[$,.]", "").cast("float"))\
.join(census,census["ZCTA10"]==income["Zip Code"])\
.withColumn("total_income",col("Estimated Median Income")*col("TOTAL_HOUSING10") )\
.groupBy("COMM").agg(
                    F.sum("TOTAL_POP_2010").alias("total_population"),
                    F.sum("total_income").alias("comm_total_income"))\
.withColumn("Average Income", col("comm_total_income")/col("total_population"))


crime_dataset = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet")\
.filter((col("LON") != 0) | (col("LAT") != 0))  \
.withColumn("point",ST_Point("LON", "LAT")) \
.select("point")


# res2 = crime_dataset \
#     .join(blocks_census, ST_Within(crime_dataset.point, blocks_census.geometry),)\
# .select("COMM","POP_2010")\
# .groupBy("COMM")\
# .agg(F.sum("POP_2010").alias("TotalPopulation"),F.count("*").alias("NumberOfCrimes"))\
# .withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res2 = crime_dataset \
    .join(blocks_census.select("geometry","COMM","POP_2010"), ST_Within(crime_dataset.point, blocks_census.geometry)) \
.groupBy("geometry","COMM","POP_2010") \
.agg(F.count("*").alias("NumberOfCrimesPerBlock"))\
.groupby("COMM") \
.agg(F.sum("POP_2010").alias("TotalPopulation"),F.sum("NumberOfCrimesPerBlock").alias("NumberOfCrimes") ) \
.withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res=res1.join(res2.hint("SHUFFLE_REPLICATE_NL"),res1["COMM"]==res2["COMM"]).select(res1["COMM"].alias("COMM"),"Average Income","Crimes per Person").orderBy(col("Average Income").desc())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



res.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 5.05 seconds
+--------------------+------------------+--------------------+
|                COMM|    Average Income|   Crimes per Person|
+--------------------+------------------+--------------------+
|      Marina del Rey| 76428.84908639747| 0.16078790655061842|
|   Pacific Palisades| 70656.11180545464|  0.4720770986558458|
|              Malibu|  67135.0118623962|0.003460207612456...|
| Palisades Highlands| 66867.44038612054|  0.2055830941821028|
|    Marina Peninsula|65235.692875259396|  0.6549938347718866|
|             Bel Air| 63041.33942621959| 0.43199608610567514|
|Palos Verdes Estates| 61905.61214466438|0.008547008547008548|
|     Manhattan Beach|60985.189241497086|0.033648790746582544|
|       Beverly Crest| 60947.48978754819| 0.37490683229813665|
|           Brentwood| 60840.62462032012|  0.5346764258279586|
|       Hermosa Beach| 57924.85594176151|0.016216216216216217|
|   Mandeville Canyon| 55572.11011444479|  0.2716207559256887|
|La Cañada Flintridge| 54900.6

In [32]:
res.explain(True)

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

== Parsed Logical Plan ==
'Sort ['Average Income DESC NULLS LAST], true
+- Project [COMM#9637 AS COMM#10053, Average Income#9877, Crimes per Person#10002]
   +- Join Inner, (COMM#9637 = COMM#10018)
      :- Project [COMM#9637, total_population#9871L, comm_total_income#9873, (comm_total_income#9873 / cast(total_population#9871L as double)) AS Average Income#9877]
      :  +- Aggregate [COMM#9637], [COMM#9637, sum(TOTAL_POP_2010#9803L) AS total_population#9871L, sum(total_income#9853) AS comm_total_income#9873]
      :     +- Project [Zip Code#9828, Community#9829, Estimated Median Income#9834, COMM#9637, ZCTA10#9654, TOTAL_POP_2010#9803L, TOTAL_HOUSING10#9805L, (Estimated Median Income#9834 * cast(TOTAL_HOUSING10#9805L as float)) AS total_income#9853]
      :        +- Join Inner, (ZCTA10#9654 = Zip Code#9828)
      :           :- Project [Zip Code#9828, Community#9829, cast(regexp_replace(Estimated Median Income#9830, [$,.], , 1) as float) AS Estimated Median Income#9834]
      :      

In [23]:
# query 4 conf 1 max income
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.spark import *
from pyspark.sql.functions import col,year

import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "1")\
    .config("spark.executor.memory", "2g")\
    .appName("Query 4") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
    
start_time=time.time()

#geojson handling
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

#the communities we want
target_comm_values = ["Marina del Rey","Pacific Palisades","Malibu"]

blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flattened_df = blocks_df.select( \
                [F.col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")\
            .filter(F.col("COMM").isin(target_comm_values))

#Crime data handling in the areas of interest
dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter(col("DATE OCC").substr(7,4) == "2015") \
.select("Vict Descent","LAT","LON")
#Creation of geometry type column
dataframe = dataframe\
    .withColumn("geom", ST_Point("LON","LAT"))

#keep data related to the specific areas
dataframe = dataframe\
    .join(flattened_df, ST_Within(dataframe.geom, flattened_df.geometry), "inner")\
    .select("Vict Descent")

#Ethnic-Race dataframe handling
ethnic_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Vict Descent", "Vict Descent right")

#Conversion of ethnic codes to full Name of Race and count for each group the number of victs
result_df = dataframe \
    .join(ethnic_df, dataframe["Vict Descent"] == ethnic_df["Vict Descent right"], how="left")\
    .withColumn("Vict Descent", F.col("Vict Descent Full"))\
    .select("Vict Descent")\
    .withColumnRenamed("Vict Descent", "Victim Descent")\
    .groupBy("Victim Descent").agg(F.count("*").alias("#"))\
    .orderBy("#",ascending=False)
    
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|544|
|               Other| 73|
|Hispanic/Latin/Me...| 60|
|             Unknown| 41|
|               Black| 37|
|                NULL| 25|
|         Other Asian| 15|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Time taken: 15.70 seconds

In [24]:
# query 4 conf 2 max income
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.spark import *
from pyspark.sql.functions import col,year

import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "2")\
    .config("spark.executor.memory", "4g")\
    .appName("Query 4") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
    
start_time=time.time()

#geojson handling
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

#the communities we want
target_comm_values = ["Marina del Rey","Pacific Palisades","Malibu"]

blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flattened_df = blocks_df.select( \
                [F.col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")\
            .filter(F.col("COMM").isin(target_comm_values))

#Crime data handling in the areas of interest
dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter(col("DATE OCC").substr(7,4) == "2015") \
.select("Vict Descent","LAT","LON")
#Creation of geometry type column
dataframe = dataframe\
    .withColumn("geom", ST_Point("LON","LAT"))

#keep data related to the specific areas
dataframe = dataframe\
    .join(flattened_df, ST_Within(dataframe.geom, flattened_df.geometry), "inner")\
    .select("Vict Descent")

#Ethnic-Race dataframe handling
ethnic_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Vict Descent", "Vict Descent right")

#Conversion of ethnic codes to full Name of Race and count for each group the number of victs
result_df = dataframe \
    .join(ethnic_df, dataframe["Vict Descent"] == ethnic_df["Vict Descent right"], how="left")\
    .withColumn("Vict Descent", F.col("Vict Descent Full"))\
    .select("Vict Descent")\
    .withColumnRenamed("Vict Descent", "Victim Descent")\
    .groupBy("Victim Descent").agg(F.count("*").alias("#"))\
    .orderBy("#",ascending=False)
    
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|544|
|               Other| 73|
|Hispanic/Latin/Me...| 60|
|             Unknown| 41|
|               Black| 37|
|                NULL| 25|
|         Other Asian| 15|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Time taken: 15.72 seconds

In [25]:
# query 4 conf 3 max income
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.spark import *
from pyspark.sql.functions import col,year

import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "8")\
    .config("spark.executor.memory", "8g")\
    .appName("Query 4") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
    
start_time=time.time()

#geojson handling
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

#the communities we want
target_comm_values = ["Marina del Rey","Pacific Palisades","Malibu"]

blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flattened_df = blocks_df.select( \
                [F.col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")\
            .filter(F.col("COMM").isin(target_comm_values))

#Crime data handling in the areas of interest
dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter(col("DATE OCC").substr(7,4) == "2015") \
.select("Vict Descent","LAT","LON")
#Creation of geometry type column
dataframe = dataframe\
    .withColumn("geom", ST_Point("LON","LAT"))

#keep data related to the specific areas
dataframe = dataframe\
    .join(flattened_df, ST_Within(dataframe.geom, flattened_df.geometry), "inner")\
    .select("Vict Descent")

#Ethnic-Race dataframe handling
ethnic_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Vict Descent", "Vict Descent right")

#Conversion of ethnic codes to full Name of Race and count for each group the number of victs
result_df = dataframe \
    .join(ethnic_df, dataframe["Vict Descent"] == ethnic_df["Vict Descent right"], how="left")\
    .withColumn("Vict Descent", F.col("Vict Descent Full"))\
    .select("Vict Descent")\
    .withColumnRenamed("Vict Descent", "Victim Descent")\
    .groupBy("Victim Descent").agg(F.count("*").alias("#"))\
    .orderBy("#",ascending=False)
    
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|544|
|               Other| 73|
|Hispanic/Latin/Me...| 60|
|             Unknown| 41|
|               Black| 37|
|                NULL| 25|
|         Other Asian| 15|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Time taken: 15.38 seconds

In [15]:
####query 3 calculate lowest income communities!!!!!

from sedona.spark import *
from pyspark.sql.functions import col
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
import time

# Create spark Session
spark = SparkSession.builder \
    .appName("GeoJSON read") \
    .getOrCreate()

# Create sedona context
sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

start_time=time.time()
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
blocks_census = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type") \
            .dropna(subset=["COMM","ZCTA10"])  \


census = blocks_census \
    .select("COMM", "POP_2010", "ZCTA10", "HOUSING10") \
    .na.fill({"POP_2010": 0, "HOUSING10": 0}) \
    .groupBy("COMM", "ZCTA10") \
    .agg(
        F.sum("POP_2010").alias("TOTAL_POP_2010"),
        F.sum("HOUSING10").alias("TOTAL_HOUSING10")
    )

income= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv", header=True)

res1= income.withColumn( "Estimated Median Income", F.regexp_replace(col("Estimated Median Income"), "[$,.]", "").cast("float"))\
.join(census,census["ZCTA10"]==income["Zip Code"])\
.withColumn("total_income",col("Estimated Median Income")*col("TOTAL_HOUSING10") )\
.groupBy("COMM").agg(
                    F.sum("TOTAL_POP_2010").alias("total_population"),
                    F.sum("total_income").alias("comm_total_income"))\
.withColumn("Average Income", col("comm_total_income")/col("total_population"))


crime_dataset = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet")\
.filter((col("LON") != 0) | (col("LAT") != 0))  \
.withColumn("point",ST_Point("LON", "LAT")) \
.select("point")


# res2 = crime_dataset \
#     .join(blocks_census, ST_Within(crime_dataset.point, blocks_census.geometry),)\
# .select("COMM","POP_2010")\
# .groupBy("COMM")\
# .agg(F.sum("POP_2010").alias("TotalPopulation"),F.count("*").alias("NumberOfCrimes"))\
# .withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res2 = crime_dataset \
    .join(blocks_census.select("geometry","COMM","POP_2010"), ST_Within(crime_dataset.point, blocks_census.geometry)) \
.groupBy("geometry","COMM","POP_2010") \
.agg(F.count("*").alias("NumberOfCrimesPerBlock"))\
.groupby("COMM") \
.agg(F.sum("POP_2010").alias("TotalPopulation"),F.sum("NumberOfCrimesPerBlock").alias("NumberOfCrimes") ) \
.withColumn("Crimes per Person",col("NumberOfCrimes")/col("TotalPopulation"))

res=res1.join(res2,res1["COMM"]==res2["COMM"]).select(res1["COMM"].alias("COMM"),"Average Income","Crimes per Person").orderBy(col("Average Income").asc())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



res.show()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Time taken: 3.09 seconds
+------------------+------------------+--------------------+
|              COMM|    Average Income|   Crimes per Person|
+------------------+------------------+--------------------+
|            Vernon| 4406.446428571428|                NULL|
|    Vernon Central| 6624.199339771586|  0.7617435282016115|
|   University Park| 6863.013283954593|  1.0671493587091436|
|        South Park| 6943.255677324596|  0.9406721227928171|
|           Central| 6972.518209022642|  0.7312082894274675|
|Wholesale District|7728.6860264829365|   2.229077179830921|
|             Watts|  7754.24222709736|  1.0362929698784107|
|Florence-Firestone| 7859.978450089285|  0.9326086563330861|
|     Green Meadows| 8027.096463346955|  1.4156235238545112|
|    Athens Village| 8294.286744092911| 0.02976190476190476|
|    Vermont Square| 8329.565933286018|  0.8801749726605218|
|     Vermont Vista|  8361.68308921438|  1.4987153441682601|
|       Walnut Park| 8422.475760992109|0.004938271604938...|

In [28]:
# query 4 conf 1 min income
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.spark import *
from pyspark.sql.functions import col,year

import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "1")\
    .config("spark.executor.memory", "2g")\
    .appName("Query 4") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
    
start_time=time.time()

#geojson handling
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

#the communities we want
target_comm_values = ["Vernon","Vernon Central","University Park"]

blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flattened_df = blocks_df.select( \
                [F.col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")\
            .filter(F.col("COMM").isin(target_comm_values))

#Crime data handling in the areas of interest
dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter(col("DATE OCC").substr(7,4) == "2015") \
.select("Vict Descent","LAT","LON")
#Creation of geometry type column
dataframe = dataframe\
    .withColumn("geom", ST_Point("LON","LAT"))

#keep data related to the specific areas
dataframe = dataframe\
    .join(flattened_df, ST_Within(dataframe.geom, flattened_df.geometry), "inner")\
    .select("Vict Descent")

#Ethnic-Race dataframe handling
ethnic_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Vict Descent", "Vict Descent right")

#Conversion of ethnic codes to full Name of Race and count for each group the number of victs
result_df = dataframe \
    .join(ethnic_df, dataframe["Vict Descent"] == ethnic_df["Vict Descent right"], how="left")\
    .withColumn("Vict Descent", F.col("Vict Descent Full"))\
    .select("Vict Descent")\
    .withColumnRenamed("Vict Descent", "Victim Descent")\
    .groupBy("Victim Descent").agg(F.count("*").alias("#"))\
    .orderBy("#",ascending=False)
    
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|2113|
|               Black| 651|
|                NULL| 414|
|               White| 389|
|               Other| 192|
|         Other Asian| 137|
|             Unknown|  26|
|American Indian/A...|  23|
|              Korean|   4|
|             Chinese|   3|
|            Filipino|   2|
|         AsianIndian|   1|
+--------------------+----+

Time taken: 15.24 seconds

In [27]:
# query 4 conf 2 min income
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.spark import *
from pyspark.sql.functions import col,year

import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "2")\
    .config("spark.executor.memory", "4g")\
    .appName("Query 4") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
    
start_time=time.time()

#geojson handling
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

#the communities we want
target_comm_values = ["Vernon","Vernon Central","University Park"]

blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flattened_df = blocks_df.select( \
                [F.col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")\
            .filter(F.col("COMM").isin(target_comm_values))

#Crime data handling in the areas of interest
dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter(col("DATE OCC").substr(7,4) == "2015") \
.select("Vict Descent","LAT","LON")
#Creation of geometry type column
dataframe = dataframe\
    .withColumn("geom", ST_Point("LON","LAT"))

#keep data related to the specific areas
dataframe = dataframe\
    .join(flattened_df, ST_Within(dataframe.geom, flattened_df.geometry), "inner")\
    .select("Vict Descent")

#Ethnic-Race dataframe handling
ethnic_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Vict Descent", "Vict Descent right")

#Conversion of ethnic codes to full Name of Race and count for each group the number of victs
result_df = dataframe \
    .join(ethnic_df, dataframe["Vict Descent"] == ethnic_df["Vict Descent right"], how="left")\
    .withColumn("Vict Descent", F.col("Vict Descent Full"))\
    .select("Vict Descent")\
    .withColumnRenamed("Vict Descent", "Victim Descent")\
    .groupBy("Victim Descent").agg(F.count("*").alias("#"))\
    .orderBy("#",ascending=False)
    
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|2113|
|               Black| 651|
|                NULL| 414|
|               White| 389|
|               Other| 192|
|         Other Asian| 137|
|             Unknown|  26|
|American Indian/A...|  23|
|              Korean|   4|
|             Chinese|   3|
|            Filipino|   2|
|         AsianIndian|   1|
+--------------------+----+

Time taken: 15.88 seconds

In [26]:
# query 4 conf 3 min income
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from sedona.spark import *
from pyspark.sql.functions import col,year

import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.memory", "8g")\
    .appName("Query 4") \
    .getOrCreate()

sedona = SedonaContext.create(spark)
    
start_time=time.time()

#geojson handling
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"

#the communities we want
target_comm_values = ["Vernon","Vernon Central","University Park"]

blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")

flattened_df = blocks_df.select( \
                [F.col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")\
            .filter(F.col("COMM").isin(target_comm_values))

#Crime data handling in the areas of interest
dataframe = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter(col("DATE OCC").substr(7,4) == "2015") \
.select("Vict Descent","LAT","LON")
#Creation of geometry type column
dataframe = dataframe\
    .withColumn("geom", ST_Point("LON","LAT"))

#keep data related to the specific areas
dataframe = dataframe\
    .join(flattened_df, ST_Within(dataframe.geom, flattened_df.geometry), "inner")\
    .select("Vict Descent")

#Ethnic-Race dataframe handling
ethnic_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv", header=True, inferSchema=True)\
    .withColumnRenamed("Vict Descent", "Vict Descent right")

#Conversion of ethnic codes to full Name of Race and count for each group the number of victs
result_df = dataframe \
    .join(ethnic_df, dataframe["Vict Descent"] == ethnic_df["Vict Descent right"], how="left")\
    .withColumn("Vict Descent", F.col("Vict Descent Full"))\
    .select("Vict Descent")\
    .withColumnRenamed("Vict Descent", "Victim Descent")\
    .groupBy("Victim Descent").agg(F.count("*").alias("#"))\
    .orderBy("#",ascending=False)
    
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+--------------------+----+
|      Victim Descent|   #|
+--------------------+----+
|Hispanic/Latin/Me...|2113|
|               Black| 651|
|                NULL| 414|
|               White| 389|
|               Other| 192|
|         Other Asian| 137|
|             Unknown|  26|
|American Indian/A...|  23|
|              Korean|   4|
|             Chinese|   3|
|            Filipino|   2|
|         AsianIndian|   1|
+--------------------+----+

Time taken: 15.64 seconds

In [30]:
# query 5 conf 1

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "2")\
    .config("spark.executor.cores", "4")\
    .config("spark.executor.memory", "8g")\
    .appName("Query 5") \
    .getOrCreate()
        
sedona = SedonaContext.create(spark)

start_time=time.time()

#Precint data handling
precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)\
    .withColumnRenamed("DIVISION","division")\
    .withColumn("geom", ST_Point(F.col("X"),F.col("Y")))\
    .select("division","geom")


#Crime data handling
crime_df = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter((col("LON") != 0) | (col("LAT") != 0))  \
    .withColumn("crime_geom", ST_Point(F.col("LON"),F.col("LAT")))\
    .select("crime_geom")

#Calculating the distance of each case with each precinct
joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom")))

#Finding the closest precinct to each crime
window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec))\
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") 
    
result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    )\
    .orderBy("#",ascending=False)    
 
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|         PACIFIC|3617.7734337150914|7654|
|        FOOTHILL| 4811.394973345948|7318|
|     WEST VALLEY|3363.5469483212823|6469|
|        VAN NUYS| 3430.765144800873|6453|
|WEST LOS ANGELES| 3950.890550890165|6193|
|         TOPANGA| 3818.025338208304|5971|
| NORTH HOLLYWOOD|3194.6900764893703|5748|
|        WILSHIRE| 2556.598961706193|5731|
|       NORTHEAST|3695.2562270997964|5726|
|          HARBOR|4047.4501072731864|5554|
|         MISSION| 3553.593615521547|5415|
|       HOLLYWOOD| 2840.164535299212|4933|
|      HOLLENBECK|3365.6869973683088|4618|
|       SOUTHWEST|2295.1702816435145|4482|
|      DEVONSHIRE|3083.4413145949866|4437|
|       SOUTHEAST| 3321.595088585187|4347|
|     77TH STREET|1931.1552614873922|3370|
|          NEWTON|1638.2180223355483|3009|
|         RAMPART|1926.0054134765985|2817|
|         OLYMPIC| 1726.857324744103|2760|
+----------

In [33]:
# query 5 conf 2

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "4")\
    .config("spark.executor.cores", "2")\
    .config("spark.executor.memory", "4g")\
    .appName("Query 5") \
    .getOrCreate()
        
sedona = SedonaContext.create(spark)

start_time=time.time()

#Precint data handling
precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)\
    .withColumnRenamed("DIVISION","division")\
    .withColumn("geom", ST_Point(F.col("X"),F.col("Y")))\
    .select("division","geom")


#Crime data handling
crime_df = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter((col("LON") != 0) | (col("LAT") != 0))  \
    .withColumn("crime_geom", ST_Point(F.col("LON"),F.col("LAT")))\
    .select("crime_geom")

#Calculating the distance of each case with each precinct
joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom")))

#Finding the closest precinct to each crime
window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec))\
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") 
    
result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    )\
    .orderBy("#",ascending=False)    
 
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|         PACIFIC|3617.7734337150914|7654|
|        FOOTHILL| 4811.394973345948|7318|
|     WEST VALLEY|3363.5469483212823|6469|
|        VAN NUYS| 3430.765144800873|6453|
|WEST LOS ANGELES| 3950.890550890165|6193|
|         TOPANGA| 3818.025338208304|5971|
| NORTH HOLLYWOOD|3194.6900764893703|5748|
|        WILSHIRE| 2556.598961706193|5731|
|       NORTHEAST|3695.2562270997964|5726|
|          HARBOR|4047.4501072731864|5554|
|         MISSION| 3553.593615521547|5415|
|       HOLLYWOOD| 2840.164535299212|4933|
|      HOLLENBECK|3365.6869973683088|4618|
|       SOUTHWEST|2295.1702816435145|4482|
|      DEVONSHIRE|3083.4413145949866|4437|
|       SOUTHEAST| 3321.595088585187|4347|
|     77TH STREET|1931.1552614873922|3370|
|          NEWTON|1638.2180223355483|3009|
|         RAMPART|1926.0054134765985|2817|
|         OLYMPIC| 1726.857324744103|2760|
+----------

In [34]:
# query 5 conf 3

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "8")\
    .config("spark.executor.cores", "1")\
    .config("spark.executor.memory", "2g")\
    .appName("Query 5") \
    .getOrCreate()
        
sedona = SedonaContext.create(spark)

start_time=time.time()

#Precint data handling
precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True)\
    .withColumnRenamed("DIVISION","division")\
    .withColumn("geom", ST_Point(F.col("X"),F.col("Y")))\
    .select("division","geom")


#Crime data handling
crime_df = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter((col("LON") != 0) | (col("LAT") != 0))  \
    .withColumn("crime_geom", ST_Point(F.col("LON"),F.col("LAT")))\
    .select("crime_geom")

#Calculating the distance of each case with each precinct
joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom")))

#Finding the closest precinct to each crime
window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec))\
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") 
    
result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    )\
    .orderBy("#",ascending=False)    
 
result_df.show()

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|         PACIFIC|3617.7734337150914|7654|
|        FOOTHILL| 4811.394973345948|7318|
|     WEST VALLEY|3363.5469483212823|6469|
|        VAN NUYS| 3430.765144800873|6453|
|WEST LOS ANGELES| 3950.890550890165|6193|
|         TOPANGA| 3818.025338208304|5971|
| NORTH HOLLYWOOD|3194.6900764893703|5748|
|        WILSHIRE| 2556.598961706193|5731|
|       NORTHEAST|3695.2562270997964|5726|
|          HARBOR|4047.4501072731864|5554|
|         MISSION| 3553.593615521547|5415|
|       HOLLYWOOD| 2840.164535299212|4933|
|      HOLLENBECK|3365.6869973683088|4618|
|       SOUTHWEST|2295.1702816435145|4482|
|      DEVONSHIRE|3083.4413145949866|4437|
|       SOUTHEAST| 3321.595088585187|4347|
|     77TH STREET|1931.1552614873922|3370|
|          NEWTON|1638.2180223355483|3009|
|         RAMPART|1926.0054134765985|2817|
|         OLYMPIC| 1726.857324744103|2760|
+----------

In [40]:
##attempt to increase the performanse for query 5

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import col
from sedona.spark import *
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number
import time

spark = SparkSession \
    .builder \
    .config("spark.executor.instances", "8")\
    .config("spark.executor.cores", "2")\
    .config("spark.executor.memory", "2g")\
    .appName("Query 5") \
    .getOrCreate()
        
sedona = SedonaContext.create(spark)

start_time = time.time()

# Precinct data handling
precincts_df = spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/LA_Police_Stations.csv", header=True) \
    .withColumnRenamed("DIVISION", "division") \
    .withColumn("geom", ST_Point(F.col("X"), F.col("Y"))) \
    .select("division", "geom") \
    .repartition(16)  

# Crime data handling
crime_df = spark.read.parquet("s3://groups-bucket-dblab-905418150721/group35/main_dataset_parquet") \
    .filter((col("LON") != 0) | (col("LAT") != 0)) \
    .withColumn("crime_geom", ST_Point(F.col("LON"), F.col("LAT"))) \
    .select("crime_geom") \
    .repartition(16)  

# Calculating the distance of each case with each precinct
joined_df = precincts_df.crossJoin(crime_df) \
    .withColumn("distance", ST_DistanceSphere(col("geom"), F.col("crime_geom"))) \
    .repartition(16)  

# Finding the closest precinct to each crime
window_spec = Window.partitionBy("crime_geom").orderBy(F.col("distance"))

closest_division_df = joined_df.withColumn("rank", row_number().over(window_spec)) \
    .filter(F.col("rank") == 1) \
    .select("division", "crime_geom", "distance") \
    .repartition(16)  

result_df = closest_division_df.groupBy("division") \
    .agg(
        F.avg("distance").alias("avg_distance"),
        F.count("crime_geom").alias("#"),
    ) \
    .orderBy("#", ascending=False) \
    .repartition(16)  

result_df.show()

end_time = time.time()

print(f"Time taken: {end_time - start_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+----------------+------------------+----+
|        division|      avg_distance|   #|
+----------------+------------------+----+
|          HARBOR|4047.4501072731864|5554|
|        FOOTHILL|4811.3949733459485|7318|
|     77TH STREET|1931.1552614873917|3370|
|      HOLLENBECK| 3365.686997368309|4618|
|         MISSION|  3553.59361552155|5415|
|         OLYMPIC| 1726.857324744103|2760|
|         RAMPART|1926.0054134765985|2817|
|       SOUTHWEST| 2295.170281643515|4482|
|WEST LOS ANGELES|3950.8905508901653|6193|
|         CENTRAL|1438.9179813974613|2055|
|         TOPANGA|3818.0253382083033|5971|
|         PACIFIC|3617.7734337150905|7654|
|       NORTHEAST|3695.2562270997955|5726|
|          NEWTON|1638.2180223355472|3009|
|       HOLLYWOOD| 2840.164535299211|4933|
| NORTH HOLLYWOOD|3194.6900764893708|5748|
|        VAN NUYS| 3430.765144800873|6453|
|        WILSHIRE|2556.5989617061946|5731|
|     WEST VALLEY|3363.5469483212823|6469|
|       SOUTHEAST|3321.5950885851876|4347|
+----------