In [34]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.types import FloatType, StructType, StringType, TimestampType, StructField, IntegerType
from pyspark.sql.functions import UserDefinedFunction, col, when, to_timestamp, to_utc_timestamp

1. Specify the schema for the crime data set. (https://sparkbyexamples.com/pyspark/pyspark-structtype-and-structfield/)

In [35]:
crime_schema = StructType([
    StructField('X', FloatType()),
    StructField('Y', FloatType()),
    StructField('RowID', IntegerType()),
    StructField('CrimeDateTime', StringType()),
    StructField('CrimeCode', StringType()),
    StructField('Location', StringType()),
    StructField('Description', StringType()),
    StructField('Inside_Outside', StringType()),
    StructField('Weapon', StringType()),
    StructField('Post', StringType()),
    StructField('District', StringType()),
    StructField('Neighborhood', StringType()),
    StructField('Latitude', FloatType()),
    StructField('Longitude', FloatType()),
    StructField('Geolocation', FloatType()),
    StructField('Premise', StringType()),
    StructField('VRIName', StringType()),
    StructField('Total_Incidents', StringType())])

2. Read the file using the schema definition

In [36]:
spark = SparkSession.builder.getOrCreate()
df = spark.read.options(header = 'True').schema(crime_schema).csv('Part1_Crime_data.csv')
name = 'CrimeDateTime'
#udf = UserDefinedFunction(lambda x: x[:-3]+'.000'.replace('/', '-'), StringType())
#df1 = df.select(*[udf(column).alias(name) if column == name else column for column in df.columns])
#df2 = df1.withColumn('CrimeDateTime',col('CrimeDateTime').cast(TimestampType()))


3. Cache the DataFrame (https://sparkbyexamples.com/spark/spark-dataframe-cache-and-persist-explained/)

In [39]:
df1 = df.cache()

4. Show the count of the rows

In [40]:
df1.count()

PythonException: 
  An exception was thrown from the Python worker. Please see the stack trace below.
Traceback (most recent call last):
  File "C:\Users\Peter\anaconda3\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 473, in main
Exception: Python in worker has different version 3.9 than that in driver 3.8, PySpark cannot run with different minor versions. Please check environment variables PYSPARK_PYTHON and PYSPARK_DRIVER_PYTHON are correctly set.


5. Print the schema

In [None]:
df1.printSchema()

6. Display first 5 rows

In [None]:
df1.show(5)

1. What are distinct crime codes?

In [None]:
df1.createOrReplaceTempView('crime_codes')

query = """
SELECT DISTINCT CrimeCode
FROM crime_codes
"""
output = spark.sql(query)
output.show(100)

2. Count the number of crimes by the crime codes and order by the resulting counts in descending order

In [None]:
df1.createOrReplaceTempView('crime_code_count')

query = """
SELECT CrimeCode, COUNT(*) as count
FROM crime_code_count
GROUP BY CrimeCode
ORDER BY count DESC
"""
output = spark.sql(query)
output.show(100)

3. Which neighborhood had most crimes?

In [None]:
df1.createOrReplaceTempView('nb_most_crimes')

query = """
SELECT Neighborhood, COUNT(*) as count
FROM nb_most_crimes
GROUP BY Neighborhood
ORDER BY count
"""
output = spark.sql(query)
output.show(1, truncate = False)

4. Which month of the year had most crimes?

In [None]:
df1.createOrReplaceTempView('most_crimes')

query = """
SELECT SUBSTRING(CrimeDateTime, 6, 2) as m, COUNT(*) as count
FROM most_crimes
GROUP BY m
ORDER BY count DESC
"""
output = spark.sql(query)
output.show(12, truncate = False)

5. What weapons were used? 

In [None]:
df1.createOrReplaceTempView('weapons_used')

query = """
SELECT Weapon
FROM weapons_used
WHERE Weapon is not null and Weapon != 'NA'
GROUP BY Weapon
"""
output = spark.sql(query)
output.show(10, truncate = False)

6. Which weapon was used the most? 

In [None]:
df1.createOrReplaceTempView('most_used_weapon')

query = """
SELECT Weapon, COUNT(*) as count
FROM most_used_weapon
WHERE Weapon is not null and Weapon != 'NA'
GROUP BY Weapon
ORDER BY count DESC
"""
output = spark.sql(query)
output.show(1, truncate = False)