### Downloading dataset

In [2]:
# Download the Chicago crimes CSV to ../Datasets/3_crimes.csv; with -nc an already existing file is not retrieved again.
!wget -nc -O ../Datasets/3_crimes.csv https://huggingface.co/datasets/gymprathap/Chicago-Crime-Dataset/resolve/main/Crimes_-_2001_to_Present_20240713.csv


File ‘../Datasets/3_crimes.csv’ already there; not retrieving.


### Spark setup

In [1]:
# Basic imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

In [2]:
# Create (or reuse) the Spark session, applying the settings below one by one
spark_settings = {
    "spark.driver.memory": "8g",
    "spark.executor.memory": "8g",
    "spark.sql.shuffle.partitions": "16",
}
session_builder = SparkSession.builder.appName("CrimesFix")
for setting_name, setting_value in spark_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)
spark = session_builder.getOrCreate()

## Initial data inspection

### Importing dataset

In [4]:
# Reading data
file_transactions = '/home/jovyan/code/Projecto/Datasets/3_crimes.csv'

! head $file_transactions
df_crimes = spark.read.csv(file_transactions, header=True, sep=',', inferSchema=True)


ID,Case Number,Date,Block,IUCR,Primary Type,Description,Location Description,Arrest,Domestic,Beat,District,Ward,Community Area,FBI Code,X Coordinate,Y Coordinate,Year,Updated On,Latitude,Longitude,Location
11037294,JA371270,03/18/2015 12:00:00 PM,0000X W WACKER DR,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,BANK,false,false,0111,001,42,32,11,,,2015,08/01/2017 03:52:26 PM,,,
11646293,JC213749,12/20/2018 03:00:00 PM,023XX N LOCKWOOD AVE,1154,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT $300 AND UNDER,APARTMENT,false,false,2515,025,36,19,11,,,2018,04/06/2019 04:04:43 PM,,,
11645836,JC212333,05/01/2016 12:25:00 AM,055XX S ROCKWELL ST,1153,DECEPTIVE PRACTICE,FINANCIAL IDENTITY THEFT OVER $ 300,,false,false,0824,008,15,63,11,,,2016,04/06/2019 04:04:43 PM,,,
11645959,JC211511,12/20/2018 04:00:00 PM,045XX N ALBANY AVE,2820,OTHER OFFENSE,TELEPHONE THREAT,RESIDENCE,false,false,1724,017,33,14,08A,,,2018,04/06/2019 04:04:43 PM,,,
11645601,JC212935,06/01/2014 12:01:00 AM,087XX S SANG

### Check top 10 rows of the initial dataset and attributes' datatypes

In [5]:
# Preview the first rows, then report the size and the inferred column types
df_crimes.show(n=10)
total_rows = df_crimes.count()
print(f'df_crimes - number of rows is {total_rows}.')
df_crimes.printSchema()

+--------+-----------+--------------------+--------------------+----+------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|      ID|Case Number|                Date|               Block|IUCR|      Primary Type|         Description|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|X Coordinate|Y Coordinate|Year|          Updated On|Latitude|Longitude|Location|
+--------+-----------+--------------------+--------------------+----+------------------+--------------------+--------------------+------+--------+----+--------+----+--------------+--------+------------+------------+----+--------------------+--------+---------+--------+
|11037294|   JA371270|03/18/2015 12:00:...|   0000X W WACKER DR|1153|DECEPTIVE PRACTICE|FINANCIAL IDENTIT...|                BANK| false|   false| 111|       1|  42|            32|      11| 

### Duplicates and NULLs

In [6]:
# Compare the row count before and after removing fully duplicated rows
rows_total = df_crimes.count()
rows_deduplicated = df_crimes.dropDuplicates().count()
print(f'df_crimes - number of rows is {rows_total}; after dropDuplicates() applied would be {rows_deduplicated}.')

df_crimes - number of rows is 8104658; after dropDuplicates() applied would be 8104658.


In [7]:
# Count the rows that would survive if every row containing a null were removed
rows_without_nulls = df_crimes.dropna(how='any').count()
print(f'''df_crimes - number of rows after dropna(how='any') would be {rows_without_nulls}.''')

df_crimes - number of rows after dropna(how='any') would be 7399802.


In [None]:
# Roughly 700 thousand rows contain at least one null, so let's check how many nulls are in each column
print('Checking nulls at each column of df_crimes')
# Count the nulls of every column in one aggregation (a single pass over the data)
# instead of launching a separate filter + count job per column.
# F.when(...) yields 1 for a null cell and null otherwise; F.count only counts non-null values.
null_counts_row = df_crimes.select([
    F.count(F.when(df_crimes[column_name].isNull(), 1)).alias(column_name)
    for column_name in df_crimes.columns
]).first()
# Same {column name: number of nulls} mapping as before, in column order
dict_nulls_retail = null_counts_row.asDict()
dict_nulls_retail

Checking nulls at each column of df_crimes


{'ID': 0,
 'Case Number': 0,
 'Date': 0,
 'Block': 0,
 'IUCR': 0,
 'Primary Type': 0,
 'Description': 0,
 'Location Description': 13257,
 'Arrest': 0,
 'Domestic': 0,
 'Beat': 0,
 'District': 47,
 'Ward': 614831,
 'Community Area': 613455,
 'FBI Code': 0,
 'X Coordinate': 89642,
 'Y Coordinate': 89642,
 'Year': 0,
 'Updated On': 0,
 'Latitude': 89642,
 'Longitude': 89642,
 'Location': 89642}

In [None]:
# The last five columns (coordinates / location) contain many nulls and are not useful for
# our analysis. Dropping them first keeps the later null removal from discarding rows that
# are otherwise completely filled.
columns_to_drop = [
    'X Coordinate',
    'Y Coordinate',
    'Latitude',
    'Longitude',
    'Location',
]
df_crimes = df_crimes.select(
    [name for name in df_crimes.columns if name not in columns_to_drop]
)

# Show the schema that is left
df_crimes.printSchema()

root
 |-- ID: integer (nullable = true)
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)



In [15]:
# Repeat the null check now that the location columns are gone
rows_before = df_crimes.count()
rows_after = df_crimes.dropna(how='any').count()
print(f'''df_crimes - number of rows before {rows_before} after dropna(how='any') would be {rows_after}.''')

df_crimes - number of rows before 8104658 after dropna(how='any') would be 7475294.


### Drop null values and make sure everything is clear

In [17]:
# Remove every row that still has a null in any column
df_crimes = df_crimes.na.drop(how='any')

In [None]:
print('Checking nulls at each column of df_crimes')
# Count the nulls of every column in one aggregation (a single pass over the data)
# instead of launching a separate filter + count job per column.
# F.when(...) yields 1 for a null cell and null otherwise; F.count only counts non-null values.
null_counts_row = df_crimes.select([
    F.count(F.when(df_crimes[column_name].isNull(), 1)).alias(column_name)
    for column_name in df_crimes.columns
]).first()
# {column name: number of nulls}; every value should be 0 after the null removal
dict_nulls_retail = null_counts_row.asDict()
dict_nulls_retail

Checking nulls at each column of df_crimes


{'ID': 0,
 'Case Number': 0,
 'Date': 0,
 'Block': 0,
 'IUCR': 0,
 'Primary Type': 0,
 'Description': 0,
 'Location Description': 0,
 'Arrest': 0,
 'Domestic': 0,
 'Beat': 0,
 'District': 0,
 'Ward': 0,
 'Community Area': 0,
 'FBI Code': 0,
 'Year': 0,
 'Updated On': 0}

### Unique values

In [22]:
print('\nUniqueness of values:')
number_records = df_crimes.count()
cols_interest = df_crimes.columns
# Lazily run one distinct-count per column and report it next to the total row count
distinct_counts = (
    (column_name, df_crimes.select(column_name).distinct().count())
    for column_name in cols_interest
)
for column_name, n_unique in distinct_counts:
    print(f'Column {column_name} has {n_unique} unique values out of {number_records} records.')


Uniqueness of values:
Column ID has 7475294 unique values out of 7475294 records.
Column Case Number has 7474737 unique values out of 7475294 records.
Column Date has 3103277 unique values out of 7475294 records.
Column Block has 39071 unique values out of 7475294 records.
Column IUCR has 404 unique values out of 7475294 records.
Column Primary Type has 35 unique values out of 7475294 records.
Column Description has 551 unique values out of 7475294 records.
Column Location Description has 217 unique values out of 7475294 records.
Column Arrest has 2 unique values out of 7475294 records.
Column Domestic has 2 unique values out of 7475294 records.
Column Beat has 304 unique values out of 7475294 records.
Column District has 24 unique values out of 7475294 records.
Column Ward has 50 unique values out of 7475294 records.
Column Community Area has 78 unique values out of 7475294 records.
Column FBI Code has 27 unique values out of 7475294 records.
Column Year has 24 unique values out of 7

## Attributes review after a data inspection + necessary data transformation

### After a primary review of the variables and cleaning dataset from NULLs, we can execute detailed data exploration and transformation. 
### Let's break it down by attributes:

#### ID

This attribute represents the number of a row. Considering our work as a ML task, we do not need this column, therefore it can be removed.

In [39]:
# The ID column is not needed for the ML task, so remove it and show the resulting schema
id_column = "ID"
df_crimes = df_crimes.drop(id_column)
df_crimes.printSchema()

root
 |-- Case Number: string (nullable = true)
 |-- Date: string (nullable = true)
 |-- Block: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Month: integer (nullable = true)



#### Case Number

This attribute represents the official case number assigned by the police. As we can see, the number of distinct case numbers is lower than the number of rows; this can happen if:
- the same case involves more than one offense;
- there are multiple locations or suspects;
- there are different actions recorded for the same incident... 

Although the number of such cases is small, we have to remove all of them, because we cannot review each one and determine why its case number is duplicated.

**Therefore, we remove all cases where multiple rows belong to the same Case Number.**
**After that we remove this column, because it is irrelevant for the model.**

In [None]:
# First find the Case Numbers that occur exactly once
case_counts = (
    df_crimes
    .groupBy("Case Number")
    .agg(F.count("*").alias("cnt"))
    .filter("cnt = 1")
    .select("Case Number")
)

# Now keep only the rows whose Case Number is unique
df_crimes = df_crimes.join(case_counts, on="Case Number", how="inner")

# The join removed rows, so refresh the total before reporting it; otherwise the
# message would show the row count computed before this filtering step.
number_records = df_crimes.count()
k = df_crimes.select('Case Number').distinct().count()
print(f'Column Case Number has {k} unique values out of {number_records} records.')

Column Case Number has 7474272 unique values out of 7474272 records.


In [97]:
# Case Number has served its purpose (filtering); it is not used as a model feature
case_number_column = 'Case Number'
df_crimes = df_crimes.drop(case_number_column)

#### Date

This attribute reflects the date and time of the offense in timestamp format. \
This information is very important. However, in order to analyze it, it is necessary to manipulate the data into an acceptable form. \
Let's break the data into pieces of Month, Day, Hour, Minute. Since we have Year column already in the initial dataset, we will move it upfront, so date columns will be placed together. \
After extracting parts of the Date, we can remove this column.

In [None]:
# Parse the textual Date into a temporary Timestamp column, derive the time parts
# from it, and discard the temporary column again
df_crimes = (
    df_crimes
    .withColumn('Timestamp', F.to_timestamp(F.col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
    .withColumn('Hour', F.hour(F.col('Timestamp')))
    .withColumn('Minute', F.minute(F.col('Timestamp')))
    .withColumn('Day', F.dayofmonth(F.col('Timestamp')))
    .withColumn('Month', F.month(F.col('Timestamp')))
    .drop('Timestamp')
)

# Date parts go first; the original Date column is left out of the selection
desired_order = ["Year", "Month", "Day", "Hour", "Minute"]
excluded_columns = desired_order + ["Date"]
remaining_columns = list(filter(lambda name: name not in excluded_columns, df_crimes.columns))
new_column_order = [*desired_order, *remaining_columns]
df_crimes = df_crimes.select(*new_column_order)

#### Block

The street where the crime occurred, but not the full address - most often rounded to the nearest block (for example: 010XX W 59TH ST). \
It was considered unnecessary to extract anything from this attribute, because it is too fine-grained. For geographical context, we have other columns to consider.

In [None]:
# Block is not used; other columns provide the geographical context
block_column = 'Block'
df_crimes = df_crimes.drop(block_column)

root
 |-- Case Number: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)



#### IUCR

The code assigned to the type of offense under the Illinois UCR system. \
The attribute fulfills all the requirements of the work, so no manipulation is required.

#### Primary Type

The primary type of crime. We have 35 types in our dataset. \
Have to check the unique values of a column, for contextual understanding of the dataset. \
Then create a numeric interpretation of this categorical attribute.

In [None]:
# List the distinct crime types (up to 35 rows) without truncating the names
primary_types = df_crimes.select("Primary Type").distinct()
primary_types.show(n=35, truncate=False)

+---------------------------------+
|Primary Type                     |
+---------------------------------+
|OFFENSE INVOLVING CHILDREN       |
|HUMAN TRAFFICKING                |
|CRIMINAL DAMAGE                  |
|PUBLIC PEACE VIOLATION           |
|NON-CRIMINAL (SUBJECT SPECIFIED) |
|ROBBERY                          |
|KIDNAPPING                       |
|STALKING                         |
|INTIMIDATION                     |
|CONCEALED CARRY LICENSE VIOLATION|
|HOMICIDE                         |
|MOTOR VEHICLE THEFT              |
|CRIM SEXUAL ASSAULT              |
|LIQUOR LAW VIOLATION             |
|THEFT                            |
|ASSAULT                          |
|PROSTITUTION                     |
|WEAPONS VIOLATION                |
|INTERFERENCE WITH PUBLIC OFFICER |
|GAMBLING                         |
|NON - CRIMINAL                   |
|BATTERY                          |
|OBSCENITY                        |
|PUBLIC INDECENCY                 |
|DECEPTIVE PRACTICE         

In [None]:
from pyspark.ml.feature import StringIndexer

# Map every crime type to a numeric index stored in a new column
indexer = StringIndexer(
    inputCol="Primary Type",
    outputCol="Primary Type Num",
)
primary_type_model = indexer.fit(df_crimes)
df_crimes = primary_type_model.transform(df_crimes)

# Store the generated index as an integer
primary_type_index = F.col("Primary Type Num").cast("int")
df_crimes = df_crimes.withColumn("Primary Type Num", primary_type_index)

+-------------------+----------------+
|Primary Type       |Primary Type Num|
+-------------------+----------------+
|NARCOTICS          |3               |
|HOMICIDE           |21              |
|HOMICIDE           |21              |
|HOMICIDE           |21              |
|MOTOR VEHICLE THEFT|7               |
|BURGLARY           |6               |
|MOTOR VEHICLE THEFT|7               |
|MOTOR VEHICLE THEFT|7               |
|MOTOR VEHICLE THEFT|7               |
|THEFT              |0               |
+-------------------+----------------+
only showing top 10 rows



In [None]:
# Put the date parts and the crime-type columns first; all other columns keep their relative order
desired_order = ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Primary Type Num', 'Primary Type']
remaining_columns = list(filter(lambda name: name not in desired_order, df_crimes.columns))
new_column_order = [*desired_order, *remaining_columns]
df_crimes = df_crimes.select(*new_column_order)
df_crimes.printSchema()

root
 |-- Case Number: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- Primary Type Num: integer (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- Updated On: string (nullable = true)
 |-- Timestamp: timestamp (nullable = true)



#### Description

To understand the data contained in this column, let's first of all check some of unique values

In [57]:
# Show a sample of the distinct Description values in full width
distinct_descriptions = df_crimes.select('Description').distinct()
distinct_descriptions.show(truncate=False)

+------------------------------+
|Description                   |
+------------------------------+
|DOMESTIC BATTERY SIMPLE       |
|UNLAWFUL POSS OF HANDGUN      |
|ATT CRIM SEXUAL ABUSE         |
|TO RESIDENCE                  |
|ATTEMPT ARSON                 |
|CRIMINAL TRANSMISSION OF HIV  |
|PRO EMP HANDS NO/MIN INJURY   |
|UNLAWFUL USE/SALE AIR RIFLE   |
|MANU/DELIVER:AMPHETAMINES     |
|BOMB THREAT                   |
|ENDANGERING LIFE/HEALTH CHILD |
|ATTEMPT: ARMED-OTHER DANG WEAP|
|BY EXPLOSIVE                  |
|ATT: TRUCK, BUS, MOTOR HOME   |
|ATT AGG CRIMINAL SEXUAL ABUSE |
|FALSE FIRE ALARM              |
|OBSTRUCTING SERVICE           |
|AGG PRO.EMP: OTHER DANG WEAPON|
|AGGRAVATED OF A CHILD         |
|UNLAWFUL USE OF A COMPUTER    |
+------------------------------+
only showing top 20 rows



It was decided to exclude the Description column. \
This attribute contains about 551 unique values and represents a refinement to the more generalized attribute Primary Type. \
For this work, it was decided to focus on generalized crime categories.

In [75]:
# Description only refines Primary Type, so it is left out
description_column = "Description"
df_crimes = df_crimes.drop(description_column)

#### Location Description

Where exactly did the crime take place. To clearly understand the attribute, let's check some of unique values

In [59]:
# Show a sample of the distinct Location Description values in full width
distinct_locations = df_crimes.select('Location Description').distinct()
distinct_locations.show(truncate=False)

+-----------------------------------------+
|Location Description                     |
+-----------------------------------------+
|CHA HALLWAY                              |
|GAS STATION                              |
|CURRENCY EXCHANGE                        |
|CTA PLATFORM                             |
|SPORTS ARENA/STADIUM                     |
|CHURCH / SYNAGOGUE / PLACE OF WORSHIP    |
|TAVERN                                   |
|AIRPORT PARKING LOT                      |
|FARM                                     |
|VEHICLE - COMMERCIAL: TROLLEY BUS        |
|RESIDENCE                                |
|HOUSE                                    |
|VEHICLE NON-COMMERCIAL                   |
|APPLIANCE STORE                          |
|HOSPITAL                                 |
|SCHOOL - PRIVATE GROUNDS                 |
|OTHER RAILROAD PROPERTY / TRAIN DEPOT    |
|CTA PARKING LOT / GARAGE / OTHER PROPERTY|
|COLLEGE / UNIVERSITY - RESIDENCE HALL    |
|PARKING LOT/GARAGE(NON.RESID.) 

In [71]:
# Map every location description to a numeric index stored in a new column
# (StringIndexer is imported in the Primary Type cell above)
indexer = StringIndexer(
    inputCol="Location Description",
    outputCol="Location Description Num"
)
df_crimes = indexer.fit(df_crimes).transform(df_crimes)
# Use F.col: a bare `col` is never imported in this notebook (only `pyspark.sql.functions as F`),
# so the previous call raised a NameError on a fresh kernel.
df_crimes = df_crimes.withColumn("Location Description Num", F.col("Location Description Num").cast("int"))
# Compare the original text with its integer index for a few rows
df_crimes.select("Location Description", "Location Description Num").show(10, truncate=False)

+--------------------+------------------------+
|Location Description|Location Description Num|
+--------------------+------------------------+
|SIDEWALK            |3                       |
|APARTMENT           |2                       |
|APARTMENT           |2                       |
|CHA STAIRWELL       |172                     |
|STREET              |0                       |
|OTHER               |4                       |
|STREET              |0                       |
|STREET              |0                       |
|STREET              |0                       |
|RESIDENCE           |1                       |
+--------------------+------------------------+
only showing top 10 rows



In [None]:
# Date parts, crime-type columns, IUCR and location columns go first; the rest keep their relative order
desired_order = [
    'Year', 'Month', 'Day', 'Hour', 'Minute',
    'Primary Type Num', 'Primary Type', 'IUCR',
    'Location Description Num', 'Location Description',
]
remaining_columns = list(filter(lambda name: name not in desired_order, df_crimes.columns))
new_column_order = [*desired_order, *remaining_columns]
df_crimes = df_crimes.select(*new_column_order)
df_crimes.printSchema()

root
 |-- Case Number: string (nullable = true)
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- Primary Type Num: integer (nullable = true)
 |-- Primary Type: string (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Location Description Num: integer (nullable = true)
 |-- Location Description: string (nullable = true)
 |-- Arrest: boolean (nullable = true)
 |-- Domestic: boolean (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community Area: integer (nullable = true)
 |-- FBI Code: string (nullable = true)
 |-- Updated On: string (nullable = true)



#### Arrest

Boolean column (true/false), indicates whether an arrest was made in this case. \
We need to transform it into a binary column (1/0)

In [None]:
# Replace the Arrest column with its integer cast
arrest_as_int = df_crimes["Arrest"].cast("int")
df_crimes = df_crimes.withColumn("Arrest", arrest_as_int)

#### Domestic

Boolean column (true/false), indicates whether the crime is related to domestic violence. \
We need to transform it into a binary column (1/0)

In [83]:
# Replace the Domestic column with its integer cast
domestic_as_int = df_crimes["Domestic"].cast("int")
df_crimes = df_crimes.withColumn("Domestic", domestic_as_int)

In [84]:
# Preview the first ten rows after the transformations so far
df_crimes.show(n=10)

+-----------+----+-----+---+----+------+----------------+-------------------+----+------------------------+--------------------+------+--------+----+--------+----+--------------+--------+--------------------+----------+
|Case Number|Year|Month|Day|Hour|Minute|Primary Type Num|       Primary Type|IUCR|Location Description Num|Location Description|Arrest|Domestic|Beat|District|Ward|Community Area|FBI Code|          Updated On|Arrest_Num|
+-----------+----+-----+---+----+------+----------------+-------------------+----+------------------------+--------------------+------+--------+----+--------+----+--------------+--------+--------------------+----------+
|  04X245238|2004|   12| 13|  21|    15|               3|          NARCOTICS|2024|                       3|            SIDEWALK|     1|       0|1122|      11|  27|            23|      18|02/28/2018 03:56:...|         1|
|    F780368|2018|    3|  3|  13|    25|              21|           HOMICIDE|0110|                       2|           AP

#### Beat

The smallest unit of a geographic division of the police force.\
Each Beat is part of a specific police district.\
Useful for operational analytics.\
It was decided not to make any manipulations and not to delete the column, as it may play an important role in the model, due to the fact that the operation of different units leads to different outcomes and there may be a pattern in it.

#### District, Ward, Community Area

These three geographical features provide the following context:\
- **District:** Police District. A wider area that includes several Beats.
- **Ward:** A political-administrative district of Chicago. Used for municipal government.
- **Community Area:** A geographic unit of a city used for city planning.

All of the columns remain without any changes.

#### FBI Code

The FBI's categorization of the offense. Provides a standard for comparison with other regions and for aggregating data at the national level.\
The column remains without any changes.

#### Updated On

The date on which the record was updated in the database.\
Column is irrelevant for this work therefore is removed.

In [88]:
# The record's last-update date is irrelevant for this work
updated_on_column = 'Updated On'
df_crimes = df_crimes.drop(updated_on_column)

### Quick data check after transformations

#### Unique values and columns list

In [89]:
print('\nUniqueness of values:')
number_records = df_crimes.count()
cols_interest = df_crimes.columns
# Lazily run one distinct-count per column and report it next to the total row count
distinct_counts = (
    (column_name, df_crimes.select(column_name).distinct().count())
    for column_name in cols_interest
)
for column_name, n_unique in distinct_counts:
    print(f'Column {column_name} has {n_unique} unique values out of {number_records} records.')


Uniqueness of values:
Column Case Number has 7474272 unique values out of 7474272 records.
Column Year has 24 unique values out of 7474272 records.
Column Month has 12 unique values out of 7474272 records.
Column Day has 31 unique values out of 7474272 records.
Column Hour has 24 unique values out of 7474272 records.
Column Minute has 60 unique values out of 7474272 records.
Column Primary Type Num has 35 unique values out of 7474272 records.
Column Primary Type has 35 unique values out of 7474272 records.
Column IUCR has 404 unique values out of 7474272 records.
Column Location Description Num has 216 unique values out of 7474272 records.
Column Location Description has 216 unique values out of 7474272 records.
Column Arrest has 2 unique values out of 7474272 records.
Column Domestic has 2 unique values out of 7474272 records.
Column Beat has 304 unique values out of 7474272 records.
Column District has 24 unique values out of 7474272 records.
Column Ward has 50 unique values out of 

## Data is ready!

## Statistical data exploration

#### Create dataframes containing numerical and categorical attributes

In [98]:
from pyspark.sql.types import NumericType, StringType, BooleanType

# Walk the schema once and sort each column into a bucket by its Spark data type:
# NumericType -> numerical, StringType / BooleanType -> categorical
schema = df_crimes.schema
numerical_columns = []
categorical_columns = []
for field in schema:
    if isinstance(field.dataType, NumericType):
        numerical_columns.append(field.name)
    elif isinstance(field.dataType, (StringType, BooleanType)):
        categorical_columns.append(field.name)

# One dataframe per bucket
numerical_df = df_crimes.select(*numerical_columns)
categorical_df = df_crimes.select(*categorical_columns)

print("Numerical columns:", numerical_columns)
print("Categorical columns:", categorical_columns)

Numerical columns: ['Year', 'Month', 'Day', 'Hour', 'Minute', 'Primary Type Num', 'Location Description Num', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community Area']
Categorical columns: ['Primary Type', 'IUCR', 'Location Description', 'FBI Code']


#### Statistics of numerical variables

In [101]:
# Summary statistics of the numerical columns
numerical_summary = numerical_df.describe()
numerical_summary.show()

+-------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------------+-------------------+------------------+------------------+------------------+------------------+------------------+
|summary|             Year|            Month|               Day|              Hour|            Minute|  Primary Type Num|Location Description Num|             Arrest|          Domestic|              Beat|          District|              Ward|    Community Area|
+-------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------------+-------------------+------------------+------------------+------------------+------------------+------------------+
|  count|          7474272|          7474272|           7474272|           7474272|           7474272|           7474272|                 7474272|            7474272|           7474272|           7474272|          

In [None]:
# Summary statistics of the categorical columns
categorical_summary = categorical_df.describe()
categorical_summary.show()

### Now, for a more in depth analysis

In [21]:
#df_crime = spark.read.parquet('../Datasets/3_crimes_cleaned')
#df_crime.show(10)

# NOTE(review): `df_crimes_cleaned` is not assigned anywhere in the visible part of this
# notebook, so this line presumably relies on leftover kernel state and would raise a
# NameError after "Restart & Run All" - confirm where it is supposed to come from.
df_crimes_cleaned.head(10)

[Row(ID=11037294, Case Number='JA371270', Date='03/18/2015 12:00:00 PM', Block='0000X W WACKER DR', IUCR='1153', Primary Type='DECEPTIVE PRACTICE', Description='FINANCIAL IDENTITY THEFT OVER $ 300', Location Description='BANK', Arrest=False, Domestic=False, Beat=111, District=1, Ward=42, Community Area=32, FBI Code='11', Year=2015, Updated On='08/01/2017 03:52:26 PM'),
 Row(ID=11646293, Case Number='JC213749', Date='12/20/2018 03:00:00 PM', Block='023XX N LOCKWOOD AVE', IUCR='1154', Primary Type='DECEPTIVE PRACTICE', Description='FINANCIAL IDENTITY THEFT $300 AND UNDER', Location Description='APARTMENT', Arrest=False, Domestic=False, Beat=2515, District=25, Ward=36, Community Area=19, FBI Code='11', Year=2018, Updated On='04/06/2019 04:04:43 PM'),
 Row(ID=11645959, Case Number='JC211511', Date='12/20/2018 04:00:00 PM', Block='045XX N ALBANY AVE', IUCR='2820', Primary Type='OTHER OFFENSE', Description='TELEPHONE THREAT', Location Description='RESIDENCE', Arrest=False, Domestic=False, Be

In [25]:
# Show the raw Date strings in full width.
# NOTE(review): `df_crime` is only assigned in a commented-out line of the previous cell,
# so this presumably relies on leftover kernel state - confirm before re-running.
df_crime.select('Date').show(10, truncate=False)

+----------------------+
|Date                  |
+----------------------+
|10/16/2021 06:55:00 PM|
|08/18/2021 12:00:00 PM|
|08/23/2021 02:30:00 PM|
|02/01/2022 07:00:00 PM|
|10/05/2021 04:55:00 AM|
|09/12/2021 02:00:00 AM|
|08/07/2021 06:00:00 PM|
|11/14/2021 08:01:00 PM|
|01/16/2022 05:10:00 PM|
|02/15/2022 11:30:00 PM|
+----------------------+
only showing top 10 rows



In [None]:


# Parse the textual Date into a Timestamp column and derive the time parts from it
df_crime = (
    df_crime
    .withColumn('Timestamp', F.to_timestamp(F.col('Date'), 'MM/dd/yyyy hh:mm:ss a'))
    .withColumn('Hour', F.hour(F.col('Timestamp')))
    .withColumn('Minute', F.minute(F.col('Timestamp')))
    .withColumn('Day', F.dayofmonth(F.col('Timestamp')))
    .withColumn('Month', F.month(F.col('Timestamp')))
)

# Preview the original Date next to the derived parts
preview_columns = ['Date', 'Hour', 'Minute', 'Day', 'Month']
df_crime.select(*preview_columns).show(10, truncate=False)

+----------------------+----+------+---+-----+----+
|Date                  |Hour|Minute|Day|Month|Year|
+----------------------+----+------+---+-----+----+
|10/16/2021 06:55:00 PM|18  |55    |16 |10   |2021|
|08/18/2021 12:00:00 PM|12  |0     |18 |8    |2021|
|08/23/2021 02:30:00 PM|14  |30    |23 |8    |2021|
|02/01/2022 07:00:00 PM|19  |0     |1  |2    |2022|
|10/05/2021 04:55:00 AM|4   |55    |5  |10   |2021|
|09/12/2021 02:00:00 AM|2   |0     |12 |9    |2021|
|08/07/2021 06:00:00 PM|18  |0     |7  |8    |2021|
|11/14/2021 08:01:00 PM|20  |1     |14 |11   |2021|
|01/16/2022 05:10:00 PM|17  |10    |16 |1    |2022|
|02/15/2022 11:30:00 PM|23  |30    |15 |2    |2022|
+----------------------+----+------+---+-----+----+
only showing top 10 rows



### Now we're going to save the prepared dataset for the next steps, to avoid "connection refused" errors down the line

In [None]:

# Create a smaller sample of the data and save it alongside the full dataset

# Sampling parameters: fixed seed, no replacement, 30% of the rows
seed_to_use = 5
with_replacement = False
sampling_fraction = 0.3
df_crimes_small = df_crimes.sample(withReplacement=with_replacement, fraction=sampling_fraction, seed=seed_to_use)

# Write the full dataset first, then the sample; existing output is overwritten
export_targets = (
    (df_crimes, '../Datasets/3_crimes_cleaned'),
    (df_crimes_small, '../Datasets/3_crimes_cleaned_small'),
)
for frame_to_save, target_path in export_targets:
    frame_to_save.write.mode('overwrite').parquet(target_path)