In [1]:
# analyse LA crime data 

In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.window import *

In [2]:
spark = SparkSession.builder.config("spark.sql.legacy.timeParserPolicy", "LEGACY").getOrCreate()

23/05/30 17:49:18 WARN Utils: Your hostname, Ravis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.6.23 instead (on interface en0)
23/05/30 17:49:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/05/30 17:49:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
23/05/30 17:49:20 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
crime_df = spark.read.csv("la_crime_data.csv", header=True, inferSchema=True)

                                                                                

In [4]:
# check for duplicates
# check if record number is unique or now
subquery_dup = crime_df.\
groupBy("DR_NO")\
.agg(count("*").alias("crime_cnt"))

result_dup = subquery_dup.\
filter("crime_cnt > 1")\
.select("DR_NO")\
.distinct()

result_dup.show()


                                                                                

+-----+
|DR_NO|
+-----+
+-----+



In [5]:
# data source = https://catalog.data.gov/dataset/crime-data-from-2020-to-present
# the dataset has which years data?
# Which crime saw the most increase in 2021 compared to 2020?
# A year has four seasons: Winter: Jan - Mar; Spring: Apr - Jun; Summer: Jul - Sep; Fall: Oct - Dec
# what are the top three crime by each season? Do analysis for 2022.
# What are the top three crimes faced by this age bracket in 2022:
# 20-29; 30-39; 40-49; 50-59
# Give the top 3 crimes at STREET premise for years 2020 - 2023?

In [6]:
# the dataset has which years data?
years_df = crime_df.withColumn("date_formatted1", to_timestamp("DATE OCC", "MM/dd/yyyy hh:mm:ss a")
                              )
years_present = years_df.select(year("date_formatted1").alias("years_present")).distinct()
years_present.show(10)

[Stage 7:=====>                                                    (1 + 9) / 10]

+-------------+
|years_present|
+-------------+
|         2020|
|         2021|
|         2022|
|         2023|
+-------------+



                                                                                

In [7]:
# crime_df.printSchema()
years_df.printSchema()

root
 |-- DR_NO: integer (nullable = true)
 |-- Date Rptd: string (nullable = true)
 |-- DATE OCC: string (nullable = true)
 |-- TIME OCC: integer (nullable = true)
 |-- AREA: integer (nullable = true)
 |-- AREA NAME: string (nullable = true)
 |-- Rpt Dist No: integer (nullable = true)
 |-- Part 1-2: integer (nullable = true)
 |-- Crm Cd: integer (nullable = true)
 |-- Crm Cd Desc: string (nullable = true)
 |-- Mocodes: string (nullable = true)
 |-- Vict Age: integer (nullable = true)
 |-- Vict Sex: string (nullable = true)
 |-- Vict Descent: string (nullable = true)
 |-- Premis Cd: integer (nullable = true)
 |-- Premis Desc: string (nullable = true)
 |-- Weapon Used Cd: integer (nullable = true)
 |-- Weapon Desc: string (nullable = true)
 |-- Status: string (nullable = true)
 |-- Status Desc: string (nullable = true)
 |-- Crm Cd 1: integer (nullable = true)
 |-- Crm Cd 2: integer (nullable = true)
 |-- Crm Cd 3: integer (nullable = true)
 |-- Crm Cd 4: integer (nullable = true)
 |-- L

In [8]:
# Which crime saw the most increase in 2021 compared to 2020?
curtailed_dataset_df = years_df.select(years_df.date_formatted1.alias("crime_date"), \
                        "Crm Cd", "Crm Cd Desc", "DR_NO")
# count the crimes
crimes_cnt = curtailed_dataset_df\
        .groupBy(year("crime_date").alias("crime_year"), col("Crm Cd").alias("crime_code")\
                , col("Crm Cd Desc").alias("crime_code_desc")\
                )\
        .agg(countDistinct(col("DR_NO")).alias("crime_cnt"))
# using spark sql
crimes_cnt.createOrReplaceTempView("crimes_counts")
most_increase_2021 = spark.sql(\
f"""
WITH CTE AS 
(
SELECT
A.crime_year,
A.crime_code,
A.crime_code_desc,
(A.crime_cnt - B.crime_cnt) as diff
FROM crimes_counts A
JOIN crimes_counts B ON A.crime_year-1 = B.crime_year AND B.crime_code = A.crime_code
WHERE A.crime_year = 2021
)

SELECT 
crime_code,
crime_code_desc,
diff as 2021_increase
FROM CTE
ORDER BY diff DESC
LIMIT 1
""")
most_increase_2021.show()

# in pyspark
cte = crimes_cnt.alias("A").join(crimes_cnt.alias("B"), 
                                   ((col("A.crime_year") - 1) == col("B.crime_year")) & (col("A.crime_code") == col("B.crime_code")), 
                                   "inner") \
    .where(col("A.crime_year") == 2021) \
    .select(col("A.crime_year"), 
            col("A.crime_code"), 
            col("A.crime_code_desc"), 
            (col("A.crime_cnt") - col("B.crime_cnt")).alias("diff"))

result = cte.select("crime_code", "crime_code_desc", "diff")\
.orderBy(col("diff").desc())\
.limit(1)

result.show()

23/05/30 17:49:36 WARN package: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
                                                                                

+----------+----------------+-------------+
|crime_code| crime_code_desc|2021_increase|
+----------+----------------+-------------+
|       510|VEHICLE - STOLEN|         2899|
+----------+----------------+-------------+



                                                                                

+----------+----------------+----+
|crime_code| crime_code_desc|diff|
+----------+----------------+----+
|       510|VEHICLE - STOLEN|2899|
+----------+----------------+----+



In [9]:
# A year has four seasons: Winter: Jan - Mar; Spring: Apr - Jun; Summer: Jul - Sep; Fall: Oct - Dec
# what are the top three crime by each season? Do analysis for 2022
formatted_dataset_df = years_df.select(years_df.date_formatted1.alias("crime_date"),\
                                    col("Crm Cd").alias("crime_code"), col("Crm Cd Desc").alias("crime_code_desc"),\
                                       "DR_NO")

formatted_dataset_df.createOrReplaceTempView("dataset")
# in spark sql
query = spark.sql(f"""
WITH CTE AS 
(
SELECT 
month(crime_date) as crime_month,
CASE 
WHEN month(crime_date) < 4 THEN 'winter'
WHEN month(crime_date) > 3 and month(crime_date) < 7 THEN 'spring'
WHEN month(crime_date) > 6 and month(crime_date) < 10 THEN 'summer'
ELSE 'fall' END as season,
DR_NO,
crime_code,
crime_code_desc 
FROM dataset
WHERE year(crime_date) = 2021
)

SELECT
season,
crime_code,
crime_code_desc,
crimes_cnt
FROM 
(
SELECT 
season,
crime_code,
crime_code_desc,
DENSE_RANK() OVER (PARTITION BY season ORDER BY COUNT(DISTINCT DR_NO) DESC) AS crimes_rnk,
COUNT(DISTINCT DR_NO) AS crimes_cnt
FROM CTE
GROUP BY 1,2,3
) temp
WHERE crimes_rnk < 4
ORDER BY 1,crimes_rnk ASC
""")
query.show()



+------+----------+--------------------+----------+
|season|crime_code|     crime_code_desc|crimes_cnt|
+------+----------+--------------------+----------+
|  fall|       510|    VEHICLE - STOLEN|      6869|
|  fall|       624|BATTERY - SIMPLE ...|      4112|
|  fall|       330|BURGLARY FROM VEH...|      3888|
|spring|       510|    VEHICLE - STOLEN|      5437|
|spring|       624|BATTERY - SIMPLE ...|      3958|
|spring|       740|VANDALISM - FELON...|      3391|
|summer|       510|    VEHICLE - STOLEN|      5844|
|summer|       624|BATTERY - SIMPLE ...|      4641|
|summer|       740|VANDALISM - FELON...|      3824|
|winter|       510|    VEHICLE - STOLEN|      5512|
|winter|       624|BATTERY - SIMPLE ...|      3477|
|winter|       330|BURGLARY FROM VEH...|      3167|
+------+----------+--------------------+----------+



                                                                                

In [10]:
# in pyspark
cte = formatted_dataset_df.select(month(col("crime_date")).alias("crime_month"),
        when(month(col("crime_date")).between(1,3), "winter")
        .when(month(col("crime_date")).between(4,6), "spring")
        .when(month(col("crime_date")).between(7,9), "summer")
        .otherwise("fall").alias("season"),
        col("DR_NO"), col("crime_code"), col("crime_code_desc"))\
.where(year(col("crime_date")) == 2021)

subquery = cte.groupBy("season", "crime_code", "crime_code_desc")\
.agg(countDistinct("DR_NO").alias("crime_cnt"))

window_spec = Window.partitionBy("season").orderBy(col("crime_cnt").desc())
subquery1 = subquery.withColumn("crimes_rnk", dense_rank().over(window_spec))

result = subquery1.where(col("crimes_rnk") < 4)\
.select("season", "crime_code", "crime_code_desc", "crime_cnt")\
.orderBy("season", "crimes_rnk")

result.show()




+------+----------+--------------------+---------+
|season|crime_code|     crime_code_desc|crime_cnt|
+------+----------+--------------------+---------+
|  fall|       510|    VEHICLE - STOLEN|     6869|
|  fall|       624|BATTERY - SIMPLE ...|     4112|
|  fall|       330|BURGLARY FROM VEH...|     3888|
|spring|       510|    VEHICLE - STOLEN|     5437|
|spring|       624|BATTERY - SIMPLE ...|     3958|
|spring|       740|VANDALISM - FELON...|     3391|
|summer|       510|    VEHICLE - STOLEN|     5844|
|summer|       624|BATTERY - SIMPLE ...|     4641|
|summer|       740|VANDALISM - FELON...|     3824|
|winter|       510|    VEHICLE - STOLEN|     5512|
|winter|       624|BATTERY - SIMPLE ...|     3477|
|winter|       330|BURGLARY FROM VEH...|     3167|
+------+----------+--------------------+---------+



                                                                                

In [11]:
cte = formatted_dataset_df.select(month(col("crime_date")).alias("crime_month"),
            when(month(col("crime_date")).between(1,3), "winter")\
        .when(month(col("crime_date")).between(4,6), "spring")\
        .when(month(col("crime_date")).between(7,9), "summer")\
        .otherwise("fall").alias("season"),
        col("DR_NO"), col("crime_code"), col("crime_code_desc"))\
    .where(year(col("crime_date")) == 2021)
subquery = cte.groupBy(col("season"), col("crime_code"), col("crime_code_desc"))\
.agg(countDistinct(col("DR_NO")).alias("crimes_cnt"))
window_spec = Window.partitionBy(col("season")).orderBy(col("crimes_cnt").desc())
subquery1 = subquery.withColumn("crimes_rnk", dense_rank().over(window_spec))
result = subquery1.where(col("crimes_rnk") < 4)\
.select("season", "crime_code", "crime_code_desc", "crimes_cnt")\
.orderBy("season", "crimes_rnk")

result.show()



+------+----------+--------------------+----------+
|season|crime_code|     crime_code_desc|crimes_cnt|
+------+----------+--------------------+----------+
|  fall|       510|    VEHICLE - STOLEN|      6869|
|  fall|       624|BATTERY - SIMPLE ...|      4112|
|  fall|       330|BURGLARY FROM VEH...|      3888|
|spring|       510|    VEHICLE - STOLEN|      5437|
|spring|       624|BATTERY - SIMPLE ...|      3958|
|spring|       740|VANDALISM - FELON...|      3391|
|summer|       510|    VEHICLE - STOLEN|      5844|
|summer|       624|BATTERY - SIMPLE ...|      4641|
|summer|       740|VANDALISM - FELON...|      3824|
|winter|       510|    VEHICLE - STOLEN|      5512|
|winter|       624|BATTERY - SIMPLE ...|      3477|
|winter|       330|BURGLARY FROM VEH...|      3167|
+------+----------+--------------------+----------+



                                                                                

In [12]:
# What are the top three crimes faced by this age bracket in 2022:
# 20-29; 30-39; 40-49; 50-59
age_dataset = years_df.select(years_df.date_formatted1.alias("crime_date"),\
            col("Crm Cd").alias("crime_code"), col("Crm Cd Desc").alias("crime_code_desc"),\
            col("Vict Age").alias("age"),"DR_NO")

filtered_dataset = age_dataset.filter(col("age").between(20,59))\
 .filter(year(col("crime_date")) == 2022)

# in spark sql
filtered_dataset.createOrReplaceTempView("table")
query = spark.sql(
f"""
WITH CTE AS (
SELECT 
CASE 
WHEN age between 20 and 29 THEN '20-29'
WHEN age between 30 and 39 THEN '30-39'
WHEN age between 40 and 49 THEN '40-49'
ELSE '50-59' END AS age_bracket,
crime_code,
crime_code_desc,
COUNT(DISTINCT DR_NO) AS crime_cnt
FROM table
GROUP BY 1, 2, 3
),
crimes_rnk AS
(
SELECT
age_bracket,
crime_code,
crime_code_desc,
crime_cnt,
DENSE_RANK() OVER (PARTITION BY age_bracket ORDER BY crime_cnt DESC) AS rnk
FROM CTE
)

SELECT 
age_bracket,
crime_code,
crime_code_desc,
crime_cnt
FROM crimes_rnk
WHERE rnk < 4
ORDER BY 1,4 DESC
""")
query.show()



+-----------+----------+--------------------+---------+
|age_bracket|crime_code|     crime_code_desc|crime_cnt|
+-----------+----------+--------------------+---------+
|      20-29|       354|   THEFT OF IDENTITY|     4678|
|      20-29|       330|BURGLARY FROM VEH...|     4059|
|      20-29|       624|BATTERY - SIMPLE ...|     3877|
|      30-39|       354|   THEFT OF IDENTITY|     7126|
|      30-39|       330|BURGLARY FROM VEH...|     4486|
|      30-39|       624|BATTERY - SIMPLE ...|     3995|
|      40-49|       354|   THEFT OF IDENTITY|     4189|
|      40-49|       624|BATTERY - SIMPLE ...|     2993|
|      40-49|       330|BURGLARY FROM VEH...|     2447|
|      50-59|       624|BATTERY - SIMPLE ...|     2833|
|      50-59|       354|   THEFT OF IDENTITY|     2530|
|      50-59|       230|ASSAULT WITH DEAD...|     1695|
+-----------+----------+--------------------+---------+



                                                                                

In [20]:
# in pyspark
age_bracket_df = filtered_dataset.withColumn("age_bracket", when(col("age").between(20, 29), "20-29")\
                .when(col("age").between(30, 39), "30-39")\
                .when(col("age").between(40, 49), "40-49")\
                .otherwise("50-59"))
cte_df = age_bracket_df.groupBy("age_bracket", "crime_code","crime_code_desc")\
.agg(countDistinct(col("DR_NO")).alias("crime_cnt"))

window_spec = Window.partitionBy("age_bracket").orderBy(col("crime_cnt").desc())

result_df = cte_df.withColumn("rnk", dense_rank().over(window_spec))\
.where(col("rnk") < 4)\
.orderBy("age_bracket", "crime_cnt")\
.select("age_bracket","crime_code","crime_code_desc","crime_cnt")

result_df.show()



+-----------+----------+--------------------+---------+
|age_bracket|crime_code|     crime_code_desc|crime_cnt|
+-----------+----------+--------------------+---------+
|      20-29|       624|BATTERY - SIMPLE ...|     3877|
|      20-29|       330|BURGLARY FROM VEH...|     4059|
|      20-29|       354|   THEFT OF IDENTITY|     4678|
|      30-39|       624|BATTERY - SIMPLE ...|     3995|
|      30-39|       330|BURGLARY FROM VEH...|     4486|
|      30-39|       354|   THEFT OF IDENTITY|     7126|
|      40-49|       330|BURGLARY FROM VEH...|     2447|
|      40-49|       624|BATTERY - SIMPLE ...|     2993|
|      40-49|       354|   THEFT OF IDENTITY|     4189|
|      50-59|       230|ASSAULT WITH DEAD...|     1695|
|      50-59|       354|   THEFT OF IDENTITY|     2530|
|      50-59|       624|BATTERY - SIMPLE ...|     2833|
+-----------+----------+--------------------+---------+



                                                                                

In [58]:
# Show the top three crimes on the street for years 2022-2023
result_df = years_df.select(
    col("Premis Cd").alias("premise_code"),
    col("Premis Desc").alias("premise_desc"),
    col("Crm Cd Desc").alias("crime_description"),
    col("DR_NO")
).where(
    (year(col("date_formatted1")).isin([2022, 2023])) & col("premise_desc").like("%STREET%")
).groupBy("premise_code", "premise_desc", "crime_description").agg(
    countDistinct("DR_NO").alias("crime_cnt")
).select(
    "premise_code", "premise_desc", "crime_description", "crime_cnt"
).orderBy(
    col("crime_cnt").desc()
).limit(3)


# Show the top three crimes on the street
result_df.show()

+------------+------------+--------------------+---------+
|premise_code|premise_desc|   crime_description|crime_cnt|
+------------+------------+--------------------+---------+
|         101|      STREET|    VEHICLE - STOLEN|    26068|
|         101|      STREET|BURGLARY FROM VEH...|     7376|
|         101|      STREET|THEFT FROM MOTOR ...|     7142|
+------------+------------+--------------------+---------+

