In [2]:
# Query 1 RDD

import csv,time
from io import StringIO
from pyspark.sql import SparkSession
from pyspark.sql import Row

sc = SparkSession \
    .builder \
    .appName("Query 1 RDD") \
    .config("spark.executor.instances", "4") \
    .getOrCreate() \
    .sparkContext

def parse_csv_line(line):
    f = StringIO(line)
    reader = csv.reader(f)
    return next(reader) 

def help1(data):
    try:
        age=int(data)
        if age<18 and age>0:
            return "child"
        if age<25:
            return "young adult"
        if age<65 :
            return "adult"
        if age>64:
            return "old"
        else:
            return "no individual victim"
    except:
        return "error"
    

start_time = time.time()
    
rdd1  = sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv")\
.map(parse_csv_line)

rdd2= sc.textFile("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv")\
.map(parse_csv_line)

header1 = rdd1.first()
header2 = rdd2.first()

# Filter out the header
rdd1_data = rdd1.filter(lambda line: line != header1)
rdd2_data = rdd2.filter(lambda line: line != header2)

crime_data = rdd1_data.union(rdd2_data) \
.filter(lambda pair: pair[9].find("AGGRAVATED") != -1 ) \
.map(lambda data: (help1(data[11]), 1)) \
.reduceByKey(lambda a, b: a + b) \
.sortBy( lambda pair : pair[1], ascending=False )

print(crime_data.collect())

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")


FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

[('adult', 121093), ('young adult', 38703), ('child', 10830), ('old', 5985)]
Time taken: 26.51 seconds

In [3]:
####query 1 dataframe

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col
from pyspark.sql import functions as F
import time


spark = SparkSession \
    .builder \
    .appName("Query 1 Dataframe") \
    .config("spark.executor.instances", "4") \
    .getOrCreate() 


def age_group(age_str):
    try:
        age=int(age_str)
        if age<18 and age>0:
            return "child"
        if age<25:
            return "young adult"
        if age<65 :
            return "adult"
        if age>64:
            return "old"
        else:
            return "no individual victim"
    except:
        return "error"


start_time=time.time()
dataframe1= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",header=True)

age_udf=udf(age_group,StringType())
dataframe=dataframe1.union(dataframe2)\
.filter(col("Crm Cd Desc").contains("AGGRAVATED"))\
.withColumn("age_group",age_udf(col("Vict Age")))\
.groupBy("age_group").agg(F.count("*").alias("count"))\
.orderBy("count",ascending=False)


dataframe.show()


end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

+-----------+------+
|  age_group| count|
+-----------+------+
|      adult|121093|
|young adult| 38703|
|      child| 10830|
|        old|  5985|
+-----------+------+

Time taken: 18.34 seconds

In [4]:
#query 2 a

from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, FloatType, TimestampType
from pyspark.sql.functions import udf, col,count,when
from pyspark.sql import functions as F
from pyspark.sql.window import Window

import time


spark = SparkSession \
    .builder \
    .appName("Query 2 Dataframe") \
    .getOrCreate() 


window_spec = Window.partitionBy("Year").orderBy(F.desc("closed_case_rate"))


start_time=time.time()


dataframe1= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2010_to_2019_20241101.csv", header=True)
dataframe2= spark.read.csv("s3://initial-notebook-data-bucket-dblab-905418150721/CrimeData/Crime_Data_from_2020_to_Present_20241101.csv",header=True)



dataframe=dataframe1.union(dataframe2)\
.withColumn("year",col("Date Rptd").substr(7,4))\
.select("year","AREA NAME","Status")\
.groupBy("year","AREA NAME").agg((count(when(col("Status") != "IC", 1)) / count("*") ).alias("closed_case_rate"))\
.withColumn("#", F.row_number().over(window_spec) )\
.filter(col("#") <= 3)\
.withColumnRenamed("AREA NAME", "precinct")

dataframe.show(24)

end_time = time.time()

print(f"Time taken: {end_time-start_time:.2f} seconds")



FloatProgress(value=0.0, bar_style='info', description='Progress:', layout=Layout(height='25px', width='50%'),…

An error was encountered:
An error occurred while calling o608.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 3 in stage 11.0 failed 4 times, most recent failure: Lost task 3.3 in stage 11.0 (TID 34) (ip-192-168-1-174.eu-central-1.compute.internal executor 15): org.apache.spark.SparkUpgradeException: [INCONSISTENT_BEHAVIOR_CROSS_VERSION.PARSE_DATETIME_BY_NEW_PARSER] You may get a different result due to the upgrading to Spark >= 3.0:
Fail to parse '05/22/2013 12:00:00 AM' in the new parser. You can set "spark.sql.legacy.timeParserPolicy" to "LEGACY" to restore the behavior before Spark 3.0, or set to "CORRECTED" and treat it as an invalid datetime string.
	at org.apache.spark.sql.errors.ExecutionErrors.failToParseDateTimeInNewParserError(ExecutionErrors.scala:54)
	at org.apache.spark.sql.errors.ExecutionErrors.failToParseDateTimeInNewParserError$(ExecutionErrors.scala:48)
	at org.apache.spark.sql.errors.ExecutionErrors$.failToParseDateTimeInNewPar