In [1]:
from pyspark.sql import SparkSession

# Build (or reuse, via getOrCreate) the SparkSession for this analysis,
# requesting 2 executors with 1 core and 2g of memory each.
spark = (
    SparkSession.builder
    .appName("Victim Descent Analysis SQL")
    .config("spark.executor.instances", "2")
    .config("spark.executor.cores", "1")
    .config("spark.executor.memory", "2g")
    .getOrCreate()
)

# Locations of the input CSV files on S3.
crime_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv"
income_data_path = "s3://initial-notebook-data-bucket-dblab-905418150721/LA_income_2015.csv"
race_codes_path = "s3://initial-notebook-data-bucket-dblab-905418150721/RE_codes.csv"

# All three files are read the same way: first line is a header and the
# column types are inferred by Spark.
csv_options = {"header": True, "inferSchema": True}
crime_df = spark.read.csv(crime_data_path, **csv_options)
income_df = spark.read.csv(income_data_path, **csv_options)
race_codes_df = spark.read.csv(race_codes_path, **csv_options)

# Register each DataFrame as a temporary view so it can be queried with SQL.
for view_name, frame in (
    ("crimes", crime_df),
    ("income", income_df),
    ("race_codes", race_codes_df),
):
    frame.createOrReplaceTempView(view_name)

# Query: keep only the crime records that occurred in 2015.
# The first 10 characters of `DATE OCC` are parsed as an MM/dd/yyyy date and
# the year of that date is compared with 2015.
# NOTE(review): presumably `DATE OCC` is a string that starts with the date
# and is followed by a time part -- confirm against the CSV.
crime_2015_query = """
SELECT *
FROM crimes
WHERE YEAR(TO_DATE(SUBSTRING(`DATE OCC`, 1, 10), 'MM/dd/yyyy')) = 2015
"""
crime_2015_df = spark.sql(crime_2015_query)
# Expose the filtered rows to the later SQL queries as the "crime_2015" view.
crime_2015_df.createOrReplaceTempView("crime_2015")

# Query: find the ZIP codes with the highest and the lowest median income.
#
# The ranking uses an explicit numeric sort key instead of the raw column:
#   * rows whose income contains no digit at all (NULL, blank, placeholder
#     text) are dropped -- Spark orders NULLs first for ASC, so they would
#     otherwise occupy the "bottom 3";
#   * everything except digits and the decimal point is stripped before the
#     cast, so a currency-formatted value (e.g. "$50,000") is ranked by
#     magnitude rather than lexicographically.
# NOTE(review): whether the CSV really stores formatted strings is not visible
# here; for a column already inferred as a plain number the key should leave
# the ordering unchanged.
_income_as_text = "CAST(`Estimated Median Income` AS STRING)"
_income_sort_key = (
    f"CAST(REGEXP_REPLACE({_income_as_text}, '[^0-9.]', '') AS DOUBLE)"
)

top_3_income_query = f"""
SELECT `Zip Code`, `Estimated Median Income`
FROM income
WHERE {_income_as_text} RLIKE '[0-9]'
ORDER BY {_income_sort_key} DESC
LIMIT 3
"""
bottom_3_income_query = f"""
SELECT `Zip Code`, `Estimated Median Income`
FROM income
WHERE {_income_as_text} RLIKE '[0-9]'
ORDER BY {_income_sort_key} ASC
LIMIT 3
"""
top_3_income_df = spark.sql(top_3_income_query)
bottom_3_income_df = spark.sql(bottom_3_income_query)

# Lists of the `Zip Code` values of the high- and low-income rows, collected
# to the driver (at most 3 rows each because of the LIMIT).
top_3_income_areas = [row["Zip Code"] for row in top_3_income_df.collect()]
bottom_3_income_areas = [row["Zip Code"] for row in bottom_3_income_df.collect()]

def _sql_string_list(values):
    """Render *values* as a comma-separated list of single-quoted SQL literals."""
    return ', '.join([f"'{value}'" for value in values])


# Restrict the 2015 crimes to the high-income and to the low-income areas.
# NOTE(review): the IN-lists hold values taken from income's `Zip Code` column
# but are matched against `AREA NAME`; unless those two columns share values,
# both result sets are empty -- confirm the intended join key.
# NOTE(review): an empty list would render as `IN ()`, which is likely
# rejected by the SQL parser.
_top_in_list = _sql_string_list(top_3_income_areas)
_bottom_in_list = _sql_string_list(bottom_3_income_areas)

crime_top_income_query = f"""
SELECT *
FROM crime_2015
WHERE `AREA NAME` IN ({_top_in_list})
"""
crime_bottom_income_query = f"""
SELECT *
FROM crime_2015
WHERE `AREA NAME` IN ({_bottom_in_list})
"""
crime_top_income_df = spark.sql(crime_top_income_query)
crime_bottom_income_df = spark.sql(crime_bottom_income_query)

# Publish both result sets as temporary views for the descent queries.
for _view_name, _frame in (
    ("crime_top_income", crime_top_income_df),
    ("crime_bottom_income", crime_bottom_income_df),
):
    _frame.createOrReplaceTempView(_view_name)

# Query: number of victims per descent in the high-income areas.
# The LEFT JOIN keeps crime rows whose `Vict Descent` has no match in
# race_codes; those rows are counted under a NULL Ethnicity.
victims_by_descent_top_query = """
SELECT rc.`Vict Descent Full` AS Ethnicity, COUNT(*) AS Victim_Count
FROM crime_top_income AS cti
LEFT JOIN race_codes AS rc
ON cti.`Vict Descent` = rc.`Vict Descent`
GROUP BY rc.`Vict Descent Full`
ORDER BY Victim_Count DESC
"""
# Same aggregation for the low-income areas.
victims_by_descent_bottom_query = """
SELECT rc.`Vict Descent Full` AS Ethnicity, COUNT(*) AS Victim_Count
FROM crime_bottom_income AS cbi
LEFT JOIN race_codes AS rc
ON cbi.`Vict Descent` = rc.`Vict Descent`
GROUP BY rc.`Vict Descent Full`
ORDER BY Victim_Count DESC
"""

# Run both queries.
victims_by_descent_top_df = spark.sql(victims_by_descent_top_query)
victims_by_descent_bottom_df = spark.sql(victims_by_descent_bottom_query)

# Print each result table under its heading, most frequent descent first.
for _heading, _result_df in (
    ("Victim Descent Analysis - Top 3 Income Areas:", victims_by_descent_top_df),
    ("\nVictim Descent Analysis - Bottom 3 Income Areas:", victims_by_descent_bottom_df),
):
    print(_heading)
    _result_df.show()




Starting Spark application


ID,YARN Application ID,Kind,State,Spark UI,Driver log,User,Current session?
4208,application_1732639283265_4148,pyspark,idle,Link,Link,,✔


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

SparkSession available as 'spark'.


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

Victim Descent Analysis - Top 3 Income Areas:
+---------+------------+
|Ethnicity|Victim_Count|
+---------+------------+
+---------+------------+


Victim Descent Analysis - Bottom 3 Income Areas:
+---------+------------+
|Ethnicity|Victim_Count|
+---------+------------+
+---------+------------+