## Reading the previously analized and prepared dataset

In [15]:
# Imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import plotly.express as px

In [16]:
# Build SparkSession
spark = SparkSession.builder.appName("BinaryClassificationB").getOrCreate()

In [17]:
# Reading data
data_dir = '../Datasets/'
file_crimes = data_dir + '3_crimes_cleaned'

In [18]:
df_clean = spark.read.parquet(file_crimes)

In [19]:
# Checking data
print(f'df_clean - number of rows: {df_clean.count()}')
df_clean.printSchema()
df_clean.show(10)

df_clean - number of rows: 7474272
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- IUCR_Num: integer (nullable = true)
 |-- IUCR: string (nullable = true)
 |-- Primary_Type_Num: integer (nullable = true)
 |-- Primary_Type: string (nullable = true)
 |-- Location_Description_Num: integer (nullable = true)
 |-- Location_Description: string (nullable = true)
 |-- Arrest: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code: string (nullable = true)
 |-- FBI_Code_Num: integer (nullable = true)

+----+-----+---+----+------+--------+----+----------------+-------------------+------------------------+--------------------+------+--------+----+--------+----+------

Since we already indexed the relevant categorical columns, we can skip the String Indexer phase of the pipeline we are creating.

The following columns were already indexed the previous notebook:

IUCR

Primary_Type

Location_Description

Arrest

Domestic

FBI_Code_Num

In [20]:
cols_categorical = ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Arrest', 'Domestic', 'Beat',  'District', 'Ward', 'Community_Area', 'FBI_Code', 'FBI_Code_Num']

cols_numeric = [col for col in df_clean.columns if col not in cols_categorical]

In [21]:
print(f'Categorical columns: {cols_categorical}')
print(f'Numeric columns: {cols_numeric}')

Categorical columns: ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code', 'FBI_Code_Num']
Numeric columns: ['Year', 'Month', 'Day', 'Hour', 'Minute', 'IUCR', 'Primary_Type', 'Location_Description']


In [22]:
cols_not_features = ['IUCR', 'Primary_Type', 'Location_Description', 'FBI_Code', 'Arrest']

categorical_cols = [i for i in cols_categorical if i not in cols_not_features]
non_categorical_cols = [i for i in cols_numeric if i not in cols_not_features]
ohe_output_cols = [x + ' OHE' for x in categorical_cols]


In [23]:
ohe_encoder = OneHotEncoder(inputCols=categorical_cols, outputCols=ohe_output_cols)
assembler_inputs = ohe_output_cols + non_categorical_cols
vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")

print(f'Input features to be used (OHE were categorical):\n{assembler_inputs}')

Input features to be used (OHE were categorical):
['IUCR_Num OHE', 'Primary_Type_Num OHE', 'Location_Description_Num OHE', 'Domestic OHE', 'Beat OHE', 'District OHE', 'Ward OHE', 'Community_Area OHE', 'FBI_Code_Num OHE', 'Year', 'Month', 'Day', 'Hour', 'Minute']


In [24]:
df_train, df_validation = df_clean.randomSplit([0.7, 0.3], 42)

print(f'There are {df_train.count()} rows in the training set and {df_validation.count()} rows in the validation set.')

There are 5232994 rows in the training set and 2241278 rows in the validation set.


In [30]:
df_train.write.mode('overwrite').parquet('../Datasets/crimes-small-train')
df_validation.write.mode('overwrite').parquet('../Datasets/crimes-small-validation')

In [37]:
if 'df_clean' in locals():
    del df_clean

In [38]:
lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol='Arrest')

In [39]:
pipeline = Pipeline(stages=[ohe_encoder, vec_assembler, lsvc])

In [42]:
pipeline.write().overwrite().save('../Datasets/pipeline-LinearSVM')