## Reading the previously analyzed and prepared dataset

In [1]:
# Imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import plotly.express as px

In [2]:
# Create the Spark session for this notebook (an already-active session is reused)
spark = (
    SparkSession.builder
    .appName("BinaryClassificationB")
    .getOrCreate()
)

In [3]:
# Location of the cleaned crimes dataset, relative to the notebook
data_dir = '../Datasets/'
file_crimes = f'{data_dir}3_crimes_cleaned'

In [4]:
# Load the cleaned crimes data from Parquet into a Spark DataFrame
df_clean = spark.read.format('parquet').load(file_crimes)

In [5]:
# Sanity-check the loaded DataFrame: row count, schema, then a 10-row preview
print(f'df_clean - number of rows: {df_clean.count()}')
df_clean.printSchema()
df_clean.show(n=10)

df_clean - number of rows: 7474272
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- IUCR_Num: integer (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary_Type_Num: integer (nullable = true)
 |-- Primary_Type: string (nullable = true)
 |-- Location_Description_Num: integer (nullable = true)
 |-- Location_Description: string (nullable = true)
 |-- Arrest: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code: string (nullable = true)

+----+-----+---+----+------+--------+----+----------------+-------------------+------------------------+--------------------+------+--------+----+--------+----+--------------+--------+
|Year|Month|Day|Hour|Minut

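Since we already indexed the relevant categorical columns (the `*_Num` columns), we can skip the `StringIndexer` stage for them in the pipeline we are creating. Note that `FBI_Code` is still a string column in the schema above, so it has to be indexed before it can be one-hot encoded.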

One-hot encoder columns:
IUCR_Num
Primary_Type_Num
Location_Description_Num
Arrest
Domestic
Beat
District
Ward
Community_Area
FBI_Code