In [11]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "2g",
        "spark.executor.cores": "1",
        "spark.driver.memory": "2g"
    }
}

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1629,application_1732639283265_1590,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1562,application_1732639283265_1523,pyspark,idle,Link,Link,,
1608,application_1732639283265_1569,pyspark,idle,Link,Link,,
1622,application_1732639283265_1583,pyspark,idle,Link,Link,,
1623,application_1732639283265_1584,pyspark,idle,Link,Link,,
1624,application_1732639283265_1585,pyspark,idle,Link,Link,,
1626,application_1732639283265_1587,pyspark,idle,Link,Link,,
1628,application_1732639283265_1589,pyspark,idle,Link,Link,,
1629,application_1732639283265_1590,pyspark,idle,Link,Link,,✔


In [12]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 2g
Executor Cores: 1

In [2]:
# %%configure -f
# {
#     "conf": {
#         "spark.executor.instances": "2",
#         "spark.executor.memory": "2g",
#         "spark.executor.cores": "1",
#         "spark.driver.memory": "2g"
#     }
# }

Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1449,application_1732639283265_1410,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1330,application_1732639283265_1292,pyspark,idle,Link,Link,,
1331,application_1732639283265_1293,pyspark,idle,Link,Link,,
1348,application_1732639283265_1310,pyspark,idle,Link,Link,,
1390,application_1732639283265_1352,pyspark,idle,Link,Link,,
1403,application_1732639283265_1365,pyspark,idle,Link,Link,,
1404,application_1732639283265_1366,pyspark,idle,Link,Link,,
1407,application_1732639283265_1369,pyspark,idle,Link,Link,,
1410,application_1732639283265_1372,pyspark,idle,Link,Link,,
1412,application_1732639283265_1374,pyspark,idle,Link,Link,,
1413,application_1732639283265_1375,pyspark,idle,Link,Link,,


In [3]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_timestamp, year
from pyspark.sql.types import IntegerType
from sedona.register import SedonaRegistrator
from sedona.spark import *



# Start the timer
start_time = time.time()

sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")


# Load Crime Data
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    inferSchema=True
)

# Transform (LAT, LON) into a geometry column
crime_df = crime_df.withColumn("geom", ST_Point("LON", "LAT"))


# Read Median Income Data
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv",
    header=True,
    inferSchema=True
)
income_df = income_df.withColumn(
    "Median Income",
    regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(IntegerType())
)
income_df = income_df.withColumnRenamed("Zip Code", "ZIP Code")

# First Join: Join Census Data with Median Income using ZIP Code and ZCTA10
census_income_df = flattened_df.join(
    income_df,
    flattened_df["ZCTA10"] == income_df["ZIP Code"],
    "inner"
).select(
    "ZCTA10", "geometry", "Community", "Median Income"
)

# Read Race and Ethnicity Mapping Data from CSV
race_ethnicity_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv",
    header=True,
    inferSchema=True
)

# Second Join: Join Census-Income Data with Crime Data using Geometry Columns
final_joined_df = census_income_df.join(
    crime_df,
    ST_Within(crime_df.geom, census_income_df.geometry),
    "inner"
).select(
    col("DATE OCC").alias("Date"),
    "Median Income",
    "Community",
    col("Vict Descent").alias("Victim Descent")
)

# Step 1: Convert 'DATE OCC' from string to TimestampType (with date and time)
final_joined_df = final_joined_df.withColumn(
    "DateParsed",
    to_timestamp(col("Date"), "MM/dd/yyyy hh:mm:ss a")  # Adjusting for MM/dd/yyyy hh:mm:ss a format
)

# Step 2: Filter for crimes that happened in 2015
final_joined_2015_df = final_joined_df.filter(year(col("DateParsed")) == 2015)


# Step 3: Join with Race and Ethnicity Mapping
final_with_race_df = final_joined_2015_df.join(
    race_ethnicity_df,
    final_joined_2015_df["Victim Descent"] == race_ethnicity_df["Vict Descent"],
    "inner"
).drop(race_ethnicity_df["Vict Descent"])

# Rename the 'Community' column in both DataFrames to avoid ambiguity
final_with_race_df = final_with_race_df.withColumnRenamed("Community", "Community_final")
census_income_df = census_income_df.withColumnRenamed("Community", "Community_census")

# Step 4: Calculate the top 3 highest and lowest income areas
income_ranking_df = census_income_df.groupBy("Community_census").agg({"Median Income": "avg"}).withColumnRenamed("avg(Median Income)", "Average Income")

top_3_highest_income = income_ranking_df.orderBy(col("Average Income").desc()).limit(3)
top_3_lowest_income = income_ranking_df.orderBy(col("Average Income").asc()).limit(3)

# Filter final_with_race_df for the top 3 highest and lowest income areas
high_income_df = final_with_race_df.join(
    top_3_highest_income,
    final_with_race_df["Community_final"] == top_3_highest_income["Community_census"]
)

low_income_df = final_with_race_df.join(
    top_3_lowest_income,
    final_with_race_df["Community_final"] == top_3_lowest_income["Community_census"]
)

# Step 5: Calculate the racial profile for the top 3 highest income areas
high_income_race_profile_df = high_income_df.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Step 6: Calculate the racial profile for the top 3 lowest income areas
low_income_race_profile_df = low_income_df.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Step 7: Prepare results for the top 3 highest income areas
print("Racial Profile for the Top 3 Highest Income Areas:")
high_income_race_profile_df = high_income_race_profile_df.withColumnRenamed("Vict Descent Full", "Victim Descent") \
                                                         .withColumnRenamed("count", "#")
high_income_race_profile_df.show()

# Step 8: Prepare results for the top 3 lowest income areas
print("Racial Profile for the Top 3 Lowest Income Areas:")
low_income_race_profile_df = low_income_race_profile_df.withColumnRenamed("Vict Descent Full", "Victim Descent") \
                                                       .withColumnRenamed("count", "#")
low_income_race_profile_df.show()

# End the timer
end_time = time.time()

# Print the execution time
execution_time = end_time - start_time
print(f"Execution Time: {execution_time:.2f} seconds")
spark.stop()

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Racial Profile for the Top 3 Highest Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|645|
|               Other|123|
|Hispanic/Latin/Me...| 71|
|             Unknown| 48|
|               Black| 38|
|         Other Asian| 24|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Racial Profile for the Top 3 Lowest Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|Hispanic/Latin/Me...|799|
|               Black|329|
|               White|284|
|               Other|187|
|         Other Asian| 37|
|             Unknown|  9|
|              Korean|  4|
|American Indian/A...|  1|
|    Pacific Islander|  1|
+--------------------+---+

Execution Time: 156.42 seconds

In [4]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 2g
Executor Cores: 1

In [5]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "4g",
        "spark.executor.cores": "2",
        "spark.driver.memory": "4g"
    }
}


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1450,application_1732639283265_1411,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1330,application_1732639283265_1292,pyspark,idle,Link,Link,,
1331,application_1732639283265_1293,pyspark,idle,Link,Link,,
1348,application_1732639283265_1310,pyspark,idle,Link,Link,,
1390,application_1732639283265_1352,pyspark,idle,Link,Link,,
1403,application_1732639283265_1365,pyspark,idle,Link,Link,,
1404,application_1732639283265_1366,pyspark,idle,Link,Link,,
1407,application_1732639283265_1369,pyspark,idle,Link,Link,,
1410,application_1732639283265_1372,pyspark,idle,Link,Link,,
1412,application_1732639283265_1374,pyspark,idle,Link,Link,,
1413,application_1732639283265_1375,pyspark,idle,Link,Link,,


In [6]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_timestamp, year
from pyspark.sql.types import IntegerType
from sedona.register import SedonaRegistrator
from sedona.spark import *



# Start the timer
start_time = time.time()

sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")


# Load Crime Data
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    inferSchema=True
)

# Transform (LAT, LON) into a geometry column
crime_df = crime_df.withColumn("geom", ST_Point("LON", "LAT"))


# Read Median Income Data
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv",
    header=True,
    inferSchema=True
)
income_df = income_df.withColumn(
    "Median Income",
    regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(IntegerType())
)
income_df = income_df.withColumnRenamed("Zip Code", "ZIP Code")

# First Join: Join Census Data with Median Income using ZIP Code and ZCTA10
census_income_df = flattened_df.join(
    income_df,
    flattened_df["ZCTA10"] == income_df["ZIP Code"],
    "inner"
).select(
    "ZCTA10", "geometry", "Community", "Median Income"
)

# Read Race and Ethnicity Mapping Data from CSV
race_ethnicity_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv",
    header=True,
    inferSchema=True
)

# Second Join: Join Census-Income Data with Crime Data using Geometry Columns
final_joined_df = census_income_df.join(
    crime_df,
    ST_Within(crime_df.geom, census_income_df.geometry),
    "inner"
).select(
    col("DATE OCC").alias("Date"),
    "Median Income",
    "Community",
    col("Vict Descent").alias("Victim Descent")
)

# Step 1: Convert 'DATE OCC' from string to TimestampType (with date and time)
final_joined_df = final_joined_df.withColumn(
    "DateParsed",
    to_timestamp(col("Date"), "MM/dd/yyyy hh:mm:ss a")  # Adjusting for MM/dd/yyyy hh:mm:ss a format
)

# Step 2: Filter for crimes that happened in 2015
final_joined_2015_df = final_joined_df.filter(year(col("DateParsed")) == 2015)


# Step 3: Join with Race and Ethnicity Mapping
final_with_race_df = final_joined_2015_df.join(
    race_ethnicity_df,
    final_joined_2015_df["Victim Descent"] == race_ethnicity_df["Vict Descent"],
    "inner"
).drop(race_ethnicity_df["Vict Descent"])

# Rename the 'Community' column in both DataFrames to avoid ambiguity
final_with_race_df = final_with_race_df.withColumnRenamed("Community", "Community_final")
census_income_df = census_income_df.withColumnRenamed("Community", "Community_census")

# Step 4: Calculate the top 3 highest and lowest income areas
income_ranking_df = census_income_df.groupBy("Community_census").agg({"Median Income": "avg"}).withColumnRenamed("avg(Median Income)", "Average Income")

top_3_highest_income = income_ranking_df.orderBy(col("Average Income").desc()).limit(3)
top_3_lowest_income = income_ranking_df.orderBy(col("Average Income").asc()).limit(3)

# Filter final_with_race_df for the top 3 highest and lowest income areas
high_income_df = final_with_race_df.join(
    top_3_highest_income,
    final_with_race_df["Community_final"] == top_3_highest_income["Community_census"]
)

low_income_df = final_with_race_df.join(
    top_3_lowest_income,
    final_with_race_df["Community_final"] == top_3_lowest_income["Community_census"]
)

# Step 5: Calculate the racial profile for the top 3 highest income areas
high_income_race_profile_df = high_income_df.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Step 6: Calculate the racial profile for the top 3 lowest income areas
low_income_race_profile_df = low_income_df.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Step 7: Prepare results for the top 3 highest income areas
print("Racial Profile for the Top 3 Highest Income Areas:")
high_income_race_profile_df = high_income_race_profile_df.withColumnRenamed("Vict Descent Full", "Victim Descent") \
                                                         .withColumnRenamed("count", "#")
high_income_race_profile_df.show()

# Step 8: Prepare results for the top 3 lowest income areas
print("Racial Profile for the Top 3 Lowest Income Areas:")
low_income_race_profile_df = low_income_race_profile_df.withColumnRenamed("Vict Descent Full", "Victim Descent") \
                                                       .withColumnRenamed("count", "#")
low_income_race_profile_df.show()

# End the timer
end_time = time.time()

# Print the execution time
execution_time = end_time - start_time
print(f"Execution Time: {execution_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Racial Profile for the Top 3 Highest Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|645|
|               Other|123|
|Hispanic/Latin/Me...| 71|
|             Unknown| 48|
|               Black| 38|
|         Other Asian| 24|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Racial Profile for the Top 3 Lowest Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|Hispanic/Latin/Me...|799|
|               Black|329|
|               White|284|
|               Other|187|
|         Other Asian| 37|
|             Unknown|  9|
|              Korean|  4|
|American Indian/A...|  1|
|    Pacific Islander|  1|
+--------------------+---+

Execution Time: 139.03 seconds

In [7]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 4g
Executor Cores: 2

In [8]:
%%configure -f
{
    "conf": {
        "spark.executor.instances": "2",
        "spark.executor.memory": "8g",
        "spark.executor.cores": "4",
        "spark.driver.memory": "8g"
    }
}


Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1451,application_1732639283265_1412,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
1330,application_1732639283265_1292,pyspark,idle,Link,Link,,
1331,application_1732639283265_1293,pyspark,idle,Link,Link,,
1348,application_1732639283265_1310,pyspark,idle,Link,Link,,
1390,application_1732639283265_1352,pyspark,idle,Link,Link,,
1403,application_1732639283265_1365,pyspark,idle,Link,Link,,
1404,application_1732639283265_1366,pyspark,idle,Link,Link,,
1407,application_1732639283265_1369,pyspark,idle,Link,Link,,
1410,application_1732639283265_1372,pyspark,idle,Link,Link,,
1412,application_1732639283265_1374,pyspark,idle,Link,Link,,
1413,application_1732639283265_1375,pyspark,idle,Link,Link,,


In [9]:
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, to_timestamp, year
from pyspark.sql.types import IntegerType
from sedona.register import SedonaRegistrator
from sedona.spark import *



# Start the timer
start_time = time.time()

sedona = SedonaContext.create(spark)
# Read the file from s3
geojson_path = "s3://initial-notebook-data-bucket-dblab-905418150721/2010_Census_Blocks.geojson"
blocks_df = sedona.read.format("geojson") \
            .option("multiLine", "true").load(geojson_path) \
            .selectExpr("explode(features) as features") \
            .select("features.*")
# Formatting magic
flattened_df = blocks_df.select( \
                [col(f"properties.{col_name}").alias(col_name) for col_name in \
                blocks_df.schema["properties"].dataType.fieldNames()] + ["geometry"]) \
            .drop("properties") \
            .drop("type")


# Load Crime Data
crime_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv",
    header=True,
    inferSchema=True
)

# Transform (LAT, LON) into a geometry column
crime_df = crime_df.withColumn("geom", ST_Point("LON", "LAT"))


# Read Median Income Data
income_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv",
    header=True,
    inferSchema=True
)
income_df = income_df.withColumn(
    "Median Income",
    regexp_replace(col("Estimated Median Income"), "[$,]", "").cast(IntegerType())
)
income_df = income_df.withColumnRenamed("Zip Code", "ZIP Code")

# First Join: Join Census Data with Median Income using ZIP Code and ZCTA10
census_income_df = flattened_df.join(
    income_df,
    flattened_df["ZCTA10"] == income_df["ZIP Code"],
    "inner"
).select(
    "ZCTA10", "geometry", "Community", "Median Income"
)

# Read Race and Ethnicity Mapping Data from CSV
race_ethnicity_df = spark.read.csv(
    "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv",
    header=True,
    inferSchema=True
)

# Second Join: Join Census-Income Data with Crime Data using Geometry Columns
final_joined_df = census_income_df.join(
    crime_df,
    ST_Within(crime_df.geom, census_income_df.geometry),
    "inner"
).select(
    col("DATE OCC").alias("Date"),
    "Median Income",
    "Community",
    col("Vict Descent").alias("Victim Descent")
)

# Step 1: Convert 'DATE OCC' from string to TimestampType (with date and time)
final_joined_df = final_joined_df.withColumn(
    "DateParsed",
    to_timestamp(col("Date"), "MM/dd/yyyy hh:mm:ss a")  # Adjusting for MM/dd/yyyy hh:mm:ss a format
)

# Step 2: Filter for crimes that happened in 2015
final_joined_2015_df = final_joined_df.filter(year(col("DateParsed")) == 2015)


# Step 3: Join with Race and Ethnicity Mapping
final_with_race_df = final_joined_2015_df.join(
    race_ethnicity_df,
    final_joined_2015_df["Victim Descent"] == race_ethnicity_df["Vict Descent"],
    "inner"
).drop(race_ethnicity_df["Vict Descent"])

# Rename the 'Community' column in both DataFrames to avoid ambiguity
final_with_race_df = final_with_race_df.withColumnRenamed("Community", "Community_final")
census_income_df = census_income_df.withColumnRenamed("Community", "Community_census")

# Step 4: Calculate the top 3 highest and lowest income areas
income_ranking_df = census_income_df.groupBy("Community_census").agg({"Median Income": "avg"}).withColumnRenamed("avg(Median Income)", "Average Income")

top_3_highest_income = income_ranking_df.orderBy(col("Average Income").desc()).limit(3)
top_3_lowest_income = income_ranking_df.orderBy(col("Average Income").asc()).limit(3)

# Filter final_with_race_df for the top 3 highest and lowest income areas
high_income_df = final_with_race_df.join(
    top_3_highest_income,
    final_with_race_df["Community_final"] == top_3_highest_income["Community_census"]
)

low_income_df = final_with_race_df.join(
    top_3_lowest_income,
    final_with_race_df["Community_final"] == top_3_lowest_income["Community_census"]
)

# Step 5: Calculate the racial profile for the top 3 highest income areas
high_income_race_profile_df = high_income_df.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Step 6: Calculate the racial profile for the top 3 lowest income areas
low_income_race_profile_df = low_income_df.groupBy("Vict Descent Full").count().orderBy("count", ascending=False)

# Step 7: Prepare results for the top 3 highest income areas
print("Racial Profile for the Top 3 Highest Income Areas:")
high_income_race_profile_df = high_income_race_profile_df.withColumnRenamed("Vict Descent Full", "Victim Descent") \
                                                         .withColumnRenamed("count", "#")
high_income_race_profile_df.show()

# Step 8: Prepare results for the top 3 lowest income areas
print("Racial Profile for the Top 3 Lowest Income Areas:")
low_income_race_profile_df = low_income_race_profile_df.withColumnRenamed("Vict Descent Full", "Victim Descent") \
                                                       .withColumnRenamed("count", "#")
low_income_race_profile_df.show()

# End the timer
end_time = time.time()

# Print the execution time
execution_time = end_time - start_time
print(f"Execution Time: {execution_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Racial Profile for the Top 3 Highest Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|               White|645|
|               Other|123|
|Hispanic/Latin/Me...| 71|
|             Unknown| 48|
|               Black| 38|
|         Other Asian| 24|
|             Chinese|  1|
|American Indian/A...|  1|
+--------------------+---+

Racial Profile for the Top 3 Lowest Income Areas:
+--------------------+---+
|      Victim Descent|  #|
+--------------------+---+
|Hispanic/Latin/Me...|799|
|               Black|329|
|               White|284|
|               Other|187|
|         Other Asian| 37|
|             Unknown|  9|
|              Korean|  4|
|American Indian/A...|  1|
|    Pacific Islander|  1|
+--------------------+---+

Execution Time: 136.01 seconds

In [10]:
# Access configuration
conf = spark.sparkContext.getConf()

# Print relevant executor settings
print("Executor Instances:", conf.get("spark.executor.instances"))
print("Executor Memory:", conf.get("spark.executor.memory"))
print("Executor Cores:", conf.get("spark.executor.cores"))

FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Executor Instances: 2
Executor Memory: 8g
Executor Cores: 4