## Reading the previously analyzed and prepared dataset

In [1]:
# Imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC, LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import plotly.express as px

In [2]:
# Build SparkSession
# Parenthesised method chain: one builder option per line, no line continuations.
spark = (
    SparkSession.builder
    .appName("CrimesFix")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.sql.shuffle.partitions", "16")
    .getOrCreate()
)

In [3]:
# Reading data
# The dataset path is built from the shared data directory and loaded as parquet.
data_dir = '../Datasets/'
file_crimes = f'{data_dir}3_crimes_cleaned'
df_clean = spark.read.parquet(file_crimes)

In [4]:
# Checking data
print(f'Clean dataframe - number of rows: {df_clean.count()}')

# These columns are removed before modelling; the schema keeps their *_Num counterparts.
cols_to_drop = ['IUCR', 'Primary_Type', 'Location_Description', 'FBI_Code']
df_clean = df_clean.drop(*cols_to_drop)

# Inspect the resulting schema and the first 10 rows
df_clean.printSchema()
df_clean.show(n=10)

Clean dataframe - number of rows: 2300084
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- IUCR_Num: integer (nullable = true)
 |-- Primary_Type_Num: integer (nullable = true)
 |-- Location_Description_Num: integer (nullable = true)
 |-- Arrest: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code_Num: integer (nullable = true)

+----+-----+---+----+------+--------+----------------+------------------------+------+--------+----+--------+----+--------------+------------+
|Year|Month|Day|Hour|Minute|IUCR_Num|Primary_Type_Num|Location_Description_Num|Arrest|Domestic|Beat|District|Ward|Community_Area|FBI_Code_Num|
+----+-----+---+----+------+--------+--------

## Feature engineering for ML model

### Create lists of categorical and numerical columns

In [None]:
# Target column
target_col = 'Arrest'

# Numeric attributes (passed to the assembler as-is)
numeric_cols = ['Year', 'Month', 'Day', 'Hour', 'Minute']

# Categorical attributes (each one goes through OneHotEncoder)
categorical_cols = [
    'IUCR_Num',
    'Primary_Type_Num',
    'Location_Description_Num',
    'Domestic',
    'Beat',
    'District',
    'Ward',
    'Community_Area',
    'FBI_Code_Num',
]

# Every encoded column is named after its source column with an "_OHE" suffix
ohe_output_cols = [col_name + '_OHE' for col_name in categorical_cols]

# OneHotEncoder for categorical features;
# handleInvalid="keep" is set so invalid inputs are kept rather than raising an error
ohe_encoder = OneHotEncoder(
    handleInvalid="keep",
    inputCols=categorical_cols,
    outputCols=ohe_output_cols,
)

# The model's feature vector: encoded categoricals first, then the numeric columns
assembler_inputs = [*ohe_output_cols, *numeric_cols]

vec_assembler = VectorAssembler(outputCol="features", inputCols=assembler_inputs)

# Checking the columns
print(f'Categorical columns: {categorical_cols}')
print(f'Numeric columns: {numeric_cols}')
print(f'OHE columns: {ohe_output_cols}')
print(f'Assembler inputs: {assembler_inputs}')

Categorical columns: ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code_Num']
Numeric columns: ['Year', 'Month', 'Day', 'Hour', 'Minute']
OHE columns: ['IUCR_Num_OHE', 'Primary_Type_Num_OHE', 'Location_Description_Num_OHE', 'Domestic_OHE', 'Beat_OHE', 'District_OHE', 'Ward_OHE', 'Community_Area_OHE', 'FBI_Code_Num_OHE']
Assembler inputs: ['IUCR_Num_OHE', 'Primary_Type_Num_OHE', 'Location_Description_Num_OHE', 'Domestic_OHE', 'Beat_OHE', 'District_OHE', 'Ward_OHE', 'Community_Area_OHE', 'FBI_Code_Num_OHE', 'Year', 'Month', 'Day', 'Hour', 'Minute']


### Splitting the dataset into training and test sets

Before splitting the dataset, it is important to check the distribution of the target variable to ensure that the classes are reasonably balanced. \
This helps prevent the model from becoming biased toward the majority class and ensures it can effectively learn to predict both classes.

In [7]:
# Distribution of the target: row count and share (in percent) for each Arrest value
total_count = df_clean.count()

rows_in_class = F.count('*')
arrest_distribution = df_clean.groupBy('Arrest').agg(
    rows_in_class.alias('count'),
    (rows_in_class / total_count * 100).alias('percentage'),
)

arrest_distribution.orderBy('Arrest').show()

+------+-------+------------------+
|Arrest|  count|        percentage|
+------+-------+------------------+
|     0|1779396| 77.36221807551377|
|     1| 520688|22.637781924486237|
+------+-------+------------------+



In [None]:
# 70/30 split into training and test sets; the fixed seed keeps the split repeatable
df_train, df_test = df_clean.randomSplit([0.7, 0.3], seed=42)

# The hold-out part is the *test* set (df_test), so the message names it accordingly
print(f'There are {df_train.count()} rows in the training set and {df_test.count()} rows in the test set.')

There are 1609952 rows in the training set and 690132 rows in the validation set.


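The target variable `Arrest` is imbalanced (77% — class 0, 23% — class 1). To prevent the model from being biased towards the majority class, class weights are applied: every class 1 row is weighted by the ratio of class 0 to class 1 counts in the training set, while class 0 rows keep a weight of 1. \
This improves the model’s ability to correctly predict both classes without discarding data (as undersampling would) or duplicating rows (as oversampling would, which risks overfitting).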

In [10]:
# Calculating class balance
# A single grouped aggregation returns both class counts in one Spark job,
# instead of running a separate filter + count per class.
class_counts = {
    row['Arrest']: row['count']
    for row in df_train.groupBy('Arrest').count().collect()
}
count_0 = class_counts.get(0, 0)
count_1 = class_counts.get(1, 0)

# Without positive examples the ratio is undefined; fail with a clear message
if count_1 == 0:
    raise ValueError('Training set contains no rows with Arrest == 1; class weights cannot be computed.')

# How many class 0 rows there are per class 1 row
balance_ratio = count_0 / count_1

print(f'Class 0 count: {count_0}, Class 1 count: {count_1}')
print(f'Balance ratio: {balance_ratio}')

# Adding a column with weights:
# class 1 rows get the balance ratio as weight, all other rows get 1.0
df_train_weighted = df_train.withColumn(
    "classWeightCol",
    F.when(F.col("Arrest") == 1, balance_ratio).otherwise(1.0)
)

Class 0 count: 1245762, Class 1 count: 364190
Balance ratio: 3.420637579285538


### Create parquet files with dataframes

In [13]:
# Persist the weighted training split and the test split as parquet,
# replacing any files left from a previous run.
split_outputs = (
    (df_train_weighted, '../Datasets/crimes-small-train'),
    (df_test, '../Datasets/crimes-small-test'),
)
for split_df, split_path in split_outputs:
    split_df.write.mode('overwrite').parquet(split_path)

## Model creation

### Linear SVM Model

In [12]:
# Linear SVM classifier: predicts "Arrest", with per-row weights read from "classWeightCol"
lsvc = LinearSVC(
    labelCol="Arrest",
    weightCol="classWeightCol",
    regParam=0.1,
    maxIter=10,
)

# Pipeline stages: one-hot encoding -> feature vector assembly -> classifier
svm_stages = [ohe_encoder, vec_assembler, lsvc]
pipeline = Pipeline(stages=svm_stages)

# Store the unfitted pipeline definition (overwrites an existing copy)
pipeline.write().overwrite().save("../Datasets/pipeline-LinearSVM")

# Train on the weighted training set, then store the fitted pipeline model
model = pipeline.fit(df_train_weighted)
model.write().overwrite().save("model-LinearSVM")

### Logistic Regression Classifier

In [14]:
# Logistic regression classifier: predicts 'Arrest', with per-row weights read from "classWeightCol"
logreg = LogisticRegression(
    labelCol='Arrest',
    weightCol="classWeightCol",
    regParam=0.1,
    maxIter=10,
)

# Pipeline stages: one-hot encoding -> feature vector assembly -> classifier
logreg_stages = [ohe_encoder, vec_assembler, logreg]
pipeline_logreg = Pipeline(stages=logreg_stages)

# Store the unfitted pipeline definition (overwrites an existing copy)
pipeline_logreg.write().overwrite().save('../Datasets/pipeline-LogReg')

# Train on the weighted training set, then store the fitted pipeline model
model_logreg = pipeline_logreg.fit(df_train_weighted)
model_logreg.write().overwrite().save('model-LogReg')

### Random Forest Classifier

In [15]:
# Random forest classifier: predicts 'Arrest' from the assembled 'features' vector,
# with per-row weights read from "classWeightCol".
# Tree building is randomised, so the seed is fixed explicitly (same value as the
# train/test split) to make repeated runs of the notebook train the same forest.
rf = RandomForestClassifier(
    labelCol='Arrest',
    featuresCol='features',
    numTrees=100,
    weightCol="classWeightCol",
    seed=42
)

# Pipeline stages: one-hot encoding -> feature vector assembly -> classifier
pipeline_rf = Pipeline(stages=[ohe_encoder, vec_assembler, rf])

# Store the unfitted pipeline definition (overwrites an existing copy)
pipeline_rf.write().overwrite().save('../Datasets/pipeline-RandomForest')

# Train on the weighted training set, then store the fitted pipeline model
model_rf = pipeline_rf.fit(df_train_weighted)
model_rf.write().overwrite().save('model-RandomForest')