## Reading the previously analized and prepared dataset

In [1]:
# Imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml import Pipeline
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC
from pyspark.ml.evaluation import BinaryClassificationEvaluator

import plotly.express as px

In [2]:
# Build SparkSession
spark = SparkSession.builder \
    .appName("CrimesFix") \
    .config("spark.driver.memory", "8g") \
    .config("spark.executor.memory", "8g") \
    .config("spark.sql.shuffle.partitions", "16") \
    .getOrCreate()

In [3]:
# Reading data
data_dir = '../Datasets/'
file_crimes = data_dir + '3_crimes_cleaned'

In [4]:
df_clean = spark.read.parquet(file_crimes)

In [5]:
# Checking data
print(f'df_clean - number of rows: {df_clean.count()}')
df_clean = df_clean.drop('IUCR', 'Primary_Type', 'Location_Description', 'FBI_Code')
df_clean.printSchema()
df_clean.show(10)

df_clean - number of rows: 7474272
root
 |-- Year: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Day: integer (nullable = true)
 |-- Hour: integer (nullable = true)
 |-- Minute: integer (nullable = true)
 |-- IUCR_Num: integer (nullable = true)
 |-- Primary_Type_Num: integer (nullable = true)
 |-- Location_Description_Num: integer (nullable = true)
 |-- Arrest: integer (nullable = true)
 |-- Domestic: integer (nullable = true)
 |-- Beat: integer (nullable = true)
 |-- District: integer (nullable = true)
 |-- Ward: integer (nullable = true)
 |-- Community_Area: integer (nullable = true)
 |-- FBI_Code_Num: integer (nullable = true)

+----+-----+---+----+------+--------+----------------+------------------------+------+--------+----+--------+----+--------------+------------+
|Year|Month|Day|Hour|Minute|IUCR_Num|Primary_Type_Num|Location_Description_Num|Arrest|Domestic|Beat|District|Ward|Community_Area|FBI_Code_Num|
+----+-----+---+----+------+--------+---------------

Since we already indexed the relevant categorical columns, we can skip the String Indexer phase of the pipeline we are creating.

The following columns were already indexed the previous notebook:

IUCR_Num

Primary_Type_Num

Location_Description_Num

FBI_Code_Num_Num

In [6]:
cols_categorical = ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Arrest', 'Domestic', 'Beat',  'District', 'Ward', 'Community_Area', 'FBI_Code_Num']

cols_numeric = [col for col in df_clean.columns if col not in cols_categorical]

In [7]:
print(f'Categorical columns: {cols_categorical}')
print(f'Numeric columns: {cols_numeric}')

Categorical columns: ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Arrest', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code_Num']
Numeric columns: ['Year', 'Month', 'Day', 'Hour', 'Minute']


In [8]:
cols_not_features = ['Arrest']


categorical_cols = [i for i in cols_categorical if i not in cols_not_features]
print(f'Categorical columns: {categorical_cols}')

Categorical columns: ['IUCR_Num', 'Primary_Type_Num', 'Location_Description_Num', 'Domestic', 'Beat', 'District', 'Ward', 'Community_Area', 'FBI_Code_Num']


In [9]:

non_categorical_cols = [i for i in cols_numeric if i not in cols_not_features]
ohe_output_cols = [x + ' OHE' for x in categorical_cols]


In [10]:
ohe_encoder = OneHotEncoder(inputCols=categorical_cols, outputCols=ohe_output_cols, handleInvalid="keep")
assembler_inputs = ohe_output_cols + non_categorical_cols
print(f'Assembler inputs: {assembler_inputs}')

Assembler inputs: ['IUCR_Num OHE', 'Primary_Type_Num OHE', 'Location_Description_Num OHE', 'Domestic OHE', 'Beat OHE', 'District OHE', 'Ward OHE', 'Community_Area OHE', 'FBI_Code_Num OHE', 'Year', 'Month', 'Day', 'Hour', 'Minute']


In [11]:

vec_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol="features")


In [12]:
df_train, df_validation = df_clean.randomSplit([0.7, 0.3], 42)

print(f'There are {df_train.count()} rows in the training set and {df_validation.count()} rows in the validation set.')

There are 5232322 rows in the training set and 2241950 rows in the validation set.


In [13]:
df_train.write.mode('overwrite').parquet('../Datasets/crimes-small-train')
df_validation.write.mode('overwrite').parquet('../Datasets/crimes-small-validation')

In [14]:
if 'df_clean' in locals():
    del df_clean

In [15]:
lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol='Arrest')

In [16]:
pipeline = Pipeline(stages=[ohe_encoder, vec_assembler, lsvc])

In [17]:
pipeline.write().overwrite().save('../Datasets/pipeline-LinearSVM')

In [18]:
df_train.show(10)

+----+-----+---+----+------+--------+----------------+------------------------+------+--------+----+--------+----+--------------+------------+
|Year|Month|Day|Hour|Minute|IUCR_Num|Primary_Type_Num|Location_Description_Num|Arrest|Domestic|Beat|District|Ward|Community_Area|FBI_Code_Num|
+----+-----+---+----+------+--------+----------------+------------------------+------+--------+----+--------+----+--------------+------------+
|2001|    1|  1|   0|     0|      26|               8|                       4|     0|       0|2424|      24|  49|             1|           8|
|2001|    1|  1|   0|     0|      27|               0|                       1|     0|       0| 522|       5|  34|            49|           0|
|2001|    1|  1|   0|     0|      27|               0|                       1|     0|       0|2222|      22|  21|            71|           0|
|2001|    1|  1|   0|     0|      27|               0|                       2|     0|       0|2024|      20|  48|             3|           0|

In [19]:
limit_rows = 100000
model = pipeline.fit(df_train.limit(limit_rows))

In [20]:
model.write().overwrite().save('model-LinearSVM')

# DELETE BELOW

In [21]:
single_row = df_validation.limit(1).collect()[0]
print(single_row)

Row(Year=2001, Month=1, Day=1, Hour=0, Minute=0, IUCR_Num=27, Primary_Type_Num=0, Location_Description_Num=1, Arrest=0, Domestic=0, Beat=1024, District=10, Ward=24, Community_Area=30, FBI_Code_Num=0)


In [23]:
# Convert single_row to a DataFrame before transforming

#df_predictions = model.transform(df_validation)

df_single_row = spark.createDataFrame([single_row.asDict()])
df_predictions = model.transform(df_single_row)

# Check its schema
df_predictions.printSchema()

root
 |-- Arrest: long (nullable = true)
 |-- Beat: long (nullable = true)
 |-- Community_Area: long (nullable = true)
 |-- Day: long (nullable = true)
 |-- District: long (nullable = true)
 |-- Domestic: long (nullable = true)
 |-- FBI_Code_Num: long (nullable = true)
 |-- Hour: long (nullable = true)
 |-- IUCR_Num: long (nullable = true)
 |-- Location_Description_Num: long (nullable = true)
 |-- Minute: long (nullable = true)
 |-- Month: long (nullable = true)
 |-- Primary_Type_Num: long (nullable = true)
 |-- Ward: long (nullable = true)
 |-- Year: long (nullable = true)
 |-- IUCR_Num OHE: vector (nullable = true)
 |-- Primary_Type_Num OHE: vector (nullable = true)
 |-- Location_Description_Num OHE: vector (nullable = true)
 |-- Domestic OHE: vector (nullable = true)
 |-- Beat OHE: vector (nullable = true)
 |-- District OHE: vector (nullable = true)
 |-- Ward OHE: vector (nullable = true)
 |-- Community_Area OHE: vector (nullable = true)
 |-- FBI_Code_Num OHE: vector (nullable = tru

In [24]:
df_predictions_eval = df_predictions.select('features', 
                    'rawPrediction', 'prediction', 'Arrest')

binary_evaluator = BinaryClassificationEvaluator(labelCol='Arrest',
                                                rawPredictionCol='rawPrediction',
                                                metricName='areaUnderROC')
    
area_under_ROC = binary_evaluator.evaluate(df_predictions_eval)

# Print out result
print(f'Metric areaUnderROC = {area_under_ROC}')

Metric areaUnderROC = 0.0


In [25]:
# Counting of the kind of predictions made
df_confusion_matrix = df_predictions_eval.groupBy('prediction', 'Arrest').count()
df_confusion_matrix.show()

+----------+------+-----+
|prediction|Arrest|count|
+----------+------+-----+
|       0.0|     0|    1|
+----------+------+-----+



In [26]:
# Compute the confusion matrix
tp = df_confusion_matrix.filter((F.col('prediction')==1.0) & (F.col('Arrest')==1)).first()
tn = df_confusion_matrix.filter((F.col('prediction')==0.0) & (F.col('Arrest')==0)).first()
fp = df_confusion_matrix.filter((F.col('prediction')==1.0) & (F.col('Arrest')==0)).first()
fn = df_confusion_matrix.filter((F.col('prediction')==0.0) & (F.col('Arrest')==1)).first()

confmat = {'TP': 0.0, 'TN': 0.0, 'FP': 0.0, 'FN': 0.0}
if (tp):
    confmat['TP'] = tp['count'] * 1.0
if (tn):
    confmat['TN'] = tn['count'] * 1.0
if (fp):
    confmat['FP'] = fp['count'] * 1.0
if (fn):
    confmat['FN'] = fn['count'] * 1.0

confmat

{'TP': 0.0, 'TN': 1.0, 'FP': 0.0, 'FN': 0.0}

In [27]:
# Based on the confusion matrix, computed the evaluation matrics:
#   accuracy, precision, recall, specifity and F1 score

# PS: Check divisons by 0.0
TP = confmat['TP']
TN = confmat['TN']
FP = confmat['FP']
FN = confmat['FN']

accuracy = (TP + TN) / (TP + TN + FP + FN) if (TP + TN + FP + FN) > 0 else 0.0
precision = TP / (TP + FP) if (TP + FP) > 0 else 0.0
recall = TP / (TP + FN) if (TP + FN) > 0 else 0.0
specifity = TN / (TN + FP) if (TN + FP) > 0 else 0.0
f1score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0

print('Evaluation metrics based on the confusion matrix:')
print(f' Accuracy = {accuracy}')
print(f' Precision = {precision}')
print(f' Recall = {recall}')
print(f' Specifity = {specifity}')
print(f' F1 score = {f1score}')


Evaluation metrics based on the confusion matrix:
 Accuracy = 1.0
 Precision = 0.0
 Recall = 0.0
 Specifity = 1.0
 F1 score = 0.0
