## Reading the previously analyzed and prepared dataset

In [1]:
# Imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC, LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import plotly.express as px

In [2]:
# Create (or reuse) the SparkSession for this notebook.
# Driver and executor memory are set to 8g each; shuffles use 16 partitions.
spark = (
    SparkSession.builder
    .appName("CrimesFix")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.sql.shuffle.partitions", "16")
    .getOrCreate()
)

----------------------------------------
Exception occurred during processing of request from ('127.0.0.1', 42366)
Traceback (most recent call last):
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/socketserver.py", line 317, in _handle_request_noblock
    self.process_request(request, client_address)
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/socketserver.py", line 348, in process_request
    self.finish_request(request, client_address)
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/socketserver.py", line 361, in finish_request
    self.RequestHandlerClass(request, client_address, self)
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/socketserver.py", line 755, in __init__
    self.handle()
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/site-packages/pyspark/accumulators.py", line 295, in handle
    poll(accum_updates)
  File "/opt/conda/envs/vscode_pyspark/lib/python3.11/site-packages/pyspark/accumulators.py", line 267, in poll
    if self.rfile i

In [3]:
# Location of the cleaned crimes dataset (directory + dataset name)
data_dir = '../Datasets/'
file_crimes = f'{data_dir}3_crimes_cleaned'

In [4]:
# Load the cleaned crimes dataset (parquet) into a Spark DataFrame
df_clean = spark.read.format('parquet').load(file_crimes)

In [5]:
# Sanity check: report the row count first, then drop the original
# categorical columns (their *_Num counterparts are kept) and inspect the result.
print(f'df_clean - number of rows: {df_clean.count()}')

cols_already_indexed = ('IUCR', 'Primary_Type', 'Location_Description', 'FBI_Code')
df_clean = df_clean.drop(*cols_already_indexed)

df_clean.printSchema()
df_clean.show(n=10)

df_clean - number of rows: 7474272
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- IUCR_Num: integer (nullable = true)
 |-- Primary_Type_Num: integer (nullable = true)
 |-- Location_Description_Num: integer (nullable = true)
 |-- Arrest: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code_Num: integer (nullable = true)

+----+-----+---+----+------+--------+----------------+------------------------+------+--------+----+--------+----+--------------+------------+
|Year|Month|Day|Hour|Minute|IUCR_Num|Primary_Type_Num|Location_Description_Num|Arrest|Domestic|Beat|District|Ward|Community_Area|FBI_Code_Num|
+----+-----+---+----+------+--------+---------------

Since we already indexed the relevant categorical columns, we can skip the String Indexer phase of the pipeline we are creating.

The following columns were already indexed in the previous notebook:

IUCR_Num

Primary_Type_Num

Location_Description_Num

FBI_Code_Num

In [6]:
# Columns treated as categorical (note: this list also contains 'Arrest')
cols_categorical = [
    'IUCR_Num',
    'Primary_Type_Num',
    'Location_Description_Num',
    'Arrest',
    'Domestic',
    'Beat',
    'District',
    'Ward',
    'Community_Area',
    'FBI_Code_Num',
]

# Every remaining DataFrame column is treated as numeric
cols_numeric = [name for name in df_clean.columns if name not in cols_categorical]

In [7]:
# Show both column groups, one per line
print(
    f'Categorical columns: {cols_categorical}',
    f'Numeric columns: {cols_numeric}',
    sep='\n',
)

Categorical columns: ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code_Num']
Numeric columns: ['Year', 'Month', 'Day', 'Hour', 'Minute']


In [8]:
# Columns that must not be used as model features
cols_not_features = ['Arrest']

# Categorical feature columns: the categorical columns minus the excluded ones
categorical_cols = [name for name in cols_categorical if name not in cols_not_features]
print(f'Categorical columns: {categorical_cols}')

Categorical columns: ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code_Num']


In [9]:

# Numeric feature columns (excluded columns filtered out)
non_categorical_cols = [name for name in cols_numeric if name not in cols_not_features]
# One output column per categorical feature, named '<column> OHE'
ohe_output_cols = [f'{name} OHE' for name in categorical_cols]


In [10]:
# One-hot encode every categorical feature column; handleInvalid="keep"
# is passed so invalid values are kept rather than raising.
ohe_encoder = OneHotEncoder(
    inputCols=categorical_cols,
    outputCols=ohe_output_cols,
    handleInvalid="keep",
)

# The assembler consumes the encoded columns followed by the numeric ones
assembler_inputs = [*ohe_output_cols, *non_categorical_cols]
print(f'Assembler inputs: {assembler_inputs}')

Assembler inputs: ['IUCR_Num OHE', 'Primary_Type_Num OHE', 'Location_Description_Num OHE', 'Domestic OHE', 'Beat OHE', 'District OHE', 'Ward OHE', 'Community_Area OHE', 'FBI_Code_Num OHE', 'Year', 'Month', 'Day', 'Hour', 'Minute']


In [11]:

# Combine all input columns into a single "features" vector column
vec_assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features",
)


In [12]:
# 70/30 train/validation split with a fixed seed
df_train, df_validation = df_clean.randomSplit(weights=[0.7, 0.3], seed=42)

print(f'There are {df_train.count()} rows in the training set and {df_validation.count()} rows in the validation set.')

There are 5232322 rows in the training set and 2241950 rows in the validation set.


In [13]:
# Persist both splits as parquet, replacing any existing output
df_train.write.parquet('../Datasets/crimes-small-train', mode='overwrite')
df_validation.write.parquet('../Datasets/crimes-small-validation', mode='overwrite')

In [14]:
# Remove the df_clean name from the notebook namespace if it is defined
# (a no-op when it is not).
globals().pop('df_clean', None)

In [15]:
# Linear SVM classifier predicting the 'Arrest' label
lsvc = LinearSVC(
    labelCol='Arrest',
    maxIter=10,
    regParam=0.1,
)

In [16]:
# Pipeline stages: one-hot encoding -> feature vector assembly -> linear SVM
pipeline = Pipeline(
    stages=[
        ohe_encoder,
        vec_assembler,
        lsvc,
    ]
)

In [17]:
# Save the (unfitted) pipeline definition, replacing any previous copy
(
    pipeline.write()
    .overwrite()
    .save('../Datasets/pipeline-LinearSVM')
)

In [18]:
# Fit on at most `limit_rows` rows of the training set
limit_rows = 100_000
model = pipeline.fit(
    df_train.limit(limit_rows)
)

In [19]:
# Save the fitted model, replacing any previous copy.
# NOTE(review): this path is relative to the working directory, unlike the
# pipeline, which is saved under '../Datasets/' — confirm this is intended.
(
    model.write()
    .overwrite()
    .save('model-LinearSVM')
)

## Let's make more models

In [None]:

# Logistic Regression Classifier
# Uses the same encoder and assembler stages as the linear SVM pipeline.
logreg = LogisticRegression(
    labelCol='Arrest',
    maxIter=10,
    regParam=0.1,
)
pipeline_logreg = Pipeline(
    stages=[ohe_encoder, vec_assembler, logreg]
)

# Save the unfitted pipeline definition
(
    pipeline_logreg.write()
    .overwrite()
    .save('../Datasets/pipeline-LogReg')
)

# Fit on at most `limit_rows` training rows, then save the fitted model
model_logreg = pipeline_logreg.fit(
    df_train.limit(limit_rows)
)
(
    model_logreg.write()
    .overwrite()
    .save('model-LogReg')
)


In [None]:

# Random Forest Classifier
# Uses the same encoder and assembler stages as the other pipelines.
# The seed is fixed (same value as the train/validation split) so that the
# forest's random choices are repeatable for a given training input; without
# it every run could produce a different saved model.
rf = RandomForestClassifier(labelCol='Arrest', featuresCol='features', numTrees=20, seed=42)
pipeline_rf = Pipeline(stages=[ohe_encoder, vec_assembler, rf])

# Save the unfitted pipeline definition, replacing any previous copy
pipeline_rf.write().overwrite().save('../Datasets/pipeline-RandomForest')

# Fit on at most `limit_rows` training rows, then save the fitted model
model_rf = pipeline_rf.fit(df_train.limit(limit_rows))
model_rf.write().overwrite().save('model-RandomForest')
