# Data classification using ML models

In this notebook we will use the ML models we saved in the previous notebook to classify batches of rows and figure out the optimal model for our data.

Afterwards, we'll use the winning model to classify the data in a streaming format.

In [30]:
# Imports
import pyspark
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T

from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.stat import Correlation
from pyspark.ml.feature import VectorAssembler, StringIndexer, OneHotEncoder
from pyspark.ml.classification import LinearSVC, LogisticRegression, GBTClassifier, RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator


In [31]:
# Create (or reuse) the Spark session used by the rest of this notebook.
# Driver/executor memory and shuffle partition count are set explicitly.
spark = (
    SparkSession.builder
    .appName("CrimesFix")
    .config("spark.driver.memory", "8g")
    .config("spark.executor.memory", "8g")
    .config("spark.sql.shuffle.partitions", "16")
    .getOrCreate()
)

In [32]:
# Load the validation dataset (Parquet) that each model below is scored on
df_validation = spark.read.parquet("../Datasets/crimes-small-validation")

### Let's start by evaluating the linear model

In [33]:
# Load the fitted linear SVM pipeline stored under 'model-LinearSVM'
model_linear = PipelineModel.load('model-LinearSVM')

In [34]:

# Run the linear pipeline over the validation rows
df_predictions = model_linear.transform(df_validation)

# Keep only the columns the evaluation steps read
df_predictions_eval = df_predictions.select(
    'features', 'rawPrediction', 'prediction', 'Arrest'
)

# Evaluator for area under the ROC curve, using 'Arrest' as the label
binary_evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='Arrest',
)
area_under_ROC_linear = binary_evaluator.evaluate(df_predictions_eval)

# Report the score
print(f'Metric areaUnderROC of the linear model = {area_under_ROC_linear}')

Metric areaUnderROC of the linear model = 0.8672023696346972


In [35]:
# Tally validation rows per (predicted class, actual 'Arrest' label) pair
df_confusion_matrix = (
    df_predictions_eval
    .groupBy('prediction', 'Arrest')
    .count()
)
df_confusion_matrix.show()

+----------+------+-------+
|prediction|Arrest|  count|
+----------+------+-------+
|       0.0|     0|1596941|
|       1.0|     1| 368949|
|       1.0|     0|  77815|
|       0.0|     1| 198245|
+----------+------+-------+



In [36]:
# Compute the confusion matrix
#
# The grouped DataFrame holds at most four rows, so collect it once and map
# the rows locally. Issuing a separate filter(...).first() per cell would run
# four Spark actions, each re-evaluating the uncached predictions and groupBy.
confmat = {'TP': 0.0, 'TN': 0.0, 'FP': 0.0, 'FN': 0.0}

# (prediction, Arrest) -> confusion-matrix cell. Python treats 1, 1.0 and
# Decimal(1) as the same dict key, so this works for any numeric column type.
cell_by_outcome = {
    (1.0, 1): 'TP',
    (0.0, 0): 'TN',
    (1.0, 0): 'FP',
    (0.0, 1): 'FN',
}

for row in df_confusion_matrix.collect():
    cell = cell_by_outcome.get((row['prediction'], row['Arrest']))
    # Combinations outside the four cells (e.g. a null label) are ignored,
    # and cells with no matching row keep their 0.0 default.
    if cell is not None:
        confmat[cell] = float(row['count'])



In [37]:
# Derive accuracy, precision, recall, specificity and F1 score from the
# confusion-matrix counts. Every ratio defaults to 0.0 and is only computed
# when its denominator is positive, so no division by zero can occur.
TP, TN, FP, FN = confmat['TP'], confmat['TN'], confmat['FP'], confmat['FN']

accuracy_linear = 0.0
if (TP + TN + FP + FN) > 0:
    accuracy_linear = (TP + TN) / (TP + TN + FP + FN)

precision_linear = 0.0
if (TP + FP) > 0:
    precision_linear = TP / (TP + FP)

recall_linear = 0.0
if (TP + FN) > 0:
    recall_linear = TP / (TP + FN)

specifity_linear = 0.0
if (TN + FP) > 0:
    specifity_linear = TN / (TN + FP)

# F1 is the harmonic mean of precision and recall
f1score_linear = 0.0
if (precision_linear + recall_linear) > 0:
    f1score_linear = 2 * precision_linear * recall_linear / (precision_linear + recall_linear)

print('Evaluation metrics based on the confusion matrix:')
print(f' Linear Accuracy = {accuracy_linear}')
print(f' Linear Precision = {precision_linear}')
print(f' Linear Recall = {recall_linear}')
print(f' Linear Specifity = {specifity_linear}')
print(f' Linear F1 score = {f1score_linear}')


Evaluation metrics based on the confusion matrix:
 Linear Accuracy = 0.8768661210107273
 Linear Precision = 0.8258252679266906
 Linear Recall = 0.6504811404916131
 Linear Specifity = 0.9535365151699711
 Linear F1 score = 0.7277402022568982


### Logistic Regression Classifier

In [38]:
# Load the fitted logistic regression pipeline stored under 'model-LogReg'
model_logreg = PipelineModel.load('model-LogReg')

In [39]:
# Run the logistic regression pipeline over the validation rows
df_predictions = model_logreg.transform(df_validation)

# Keep only the columns the evaluation steps read
df_predictions_eval = df_predictions.select(
    'features', 'rawPrediction', 'prediction', 'Arrest'
)

# Evaluator for area under the ROC curve, using 'Arrest' as the label
binary_evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='Arrest',
)
area_under_ROC_logreg = binary_evaluator.evaluate(df_predictions_eval)

# Report the score
print(f'Metric areaUnderROC of the Logistic Regression model = {area_under_ROC_logreg}')

Metric areaUnderROC of the Logistic Regression model = 0.8984182843426602


In [40]:
# Tally validation rows per (predicted class, actual 'Arrest' label) pair
df_confusion_matrix = (
    df_predictions_eval
    .groupBy('prediction', 'Arrest')
    .count()
)
df_confusion_matrix.show()

+----------+------+-------+
|prediction|Arrest|  count|
+----------+------+-------+
|       0.0|     0|1622855|
|       1.0|     1| 355287|
|       1.0|     0|  51901|
|       0.0|     1| 211907|
+----------+------+-------+



In [41]:
# Compute the confusion matrix
#
# The grouped DataFrame holds at most four rows, so collect it once and map
# the rows locally. Issuing a separate filter(...).first() per cell would run
# four Spark actions, each re-evaluating the uncached predictions and groupBy.
confmat = {'TP': 0.0, 'TN': 0.0, 'FP': 0.0, 'FN': 0.0}

# (prediction, Arrest) -> confusion-matrix cell. Python treats 1, 1.0 and
# Decimal(1) as the same dict key, so this works for any numeric column type.
cell_by_outcome = {
    (1.0, 1): 'TP',
    (0.0, 0): 'TN',
    (1.0, 0): 'FP',
    (0.0, 1): 'FN',
}

for row in df_confusion_matrix.collect():
    cell = cell_by_outcome.get((row['prediction'], row['Arrest']))
    # Combinations outside the four cells (e.g. a null label) are ignored,
    # and cells with no matching row keep their 0.0 default.
    if cell is not None:
        confmat[cell] = float(row['count'])

In [42]:
# Derive accuracy, precision, recall, specificity and F1 score from the
# confusion-matrix counts. Every ratio defaults to 0.0 and is only computed
# when its denominator is positive, so no division by zero can occur.
TP, TN, FP, FN = confmat['TP'], confmat['TN'], confmat['FP'], confmat['FN']

accuracy_logreg = 0.0
if (TP + TN + FP + FN) > 0:
    accuracy_logreg = (TP + TN) / (TP + TN + FP + FN)

precision_logreg = 0.0
if (TP + FP) > 0:
    precision_logreg = TP / (TP + FP)

recall_logreg = 0.0
if (TP + FN) > 0:
    recall_logreg = TP / (TP + FN)

specifity_logreg = 0.0
if (TN + FP) > 0:
    specifity_logreg = TN / (TN + FP)

# F1 is the harmonic mean of precision and recall
f1score_logreg = 0.0
if (precision_logreg + recall_logreg) > 0:
    f1score_logreg = 2 * precision_logreg * recall_logreg / (precision_logreg + recall_logreg)

print('Evaluation metrics based on the confusion matrix:')
print(f' Logistic Regression Accuracy = {accuracy_logreg}')
print(f' Logistic Regression Precision = {precision_logreg}')
print(f' Logistic Regression Recall = {recall_logreg}')
print(f' Logistic Regression Specifity = {specifity_logreg}')
print(f' Logistic Regression F1 score = {f1score_logreg}')


Evaluation metrics based on the confusion matrix:
 Logistic Regression Accuracy = 0.882331006489886
 Logistic Regression Precision = 0.8725379922787508
 Logistic Regression Recall = 0.6263941438026496
 Logistic Regression Specifity = 0.9690098139669301
 Logistic Regression F1 score = 0.7292560823167916


### Random Forest Classifier

In [43]:
# Load the fitted random forest pipeline stored under 'model-RandomForest'
model_rf = PipelineModel.load('model-RandomForest')

In [44]:
# Run the random forest pipeline over the validation rows
df_predictions = model_rf.transform(df_validation)

# Keep only the columns the evaluation steps read
df_predictions_eval = df_predictions.select(
    'features', 'rawPrediction', 'prediction', 'Arrest'
)

# Evaluator for area under the ROC curve, using 'Arrest' as the label
binary_evaluator = BinaryClassificationEvaluator(
    metricName='areaUnderROC',
    rawPredictionCol='rawPrediction',
    labelCol='Arrest',
)
area_under_ROC_rf = binary_evaluator.evaluate(df_predictions_eval)

# Report the score
print(f'Metric areaUnderROC of the Random Forest model = {area_under_ROC_rf}')

Metric areaUnderROC of the Random Forest model = 0.8645578453101262


In [45]:
# Tally validation rows per (predicted class, actual 'Arrest' label) pair
df_confusion_matrix = (
    df_predictions_eval
    .groupBy('prediction', 'Arrest')
    .count()
)
df_confusion_matrix.show()

+----------+------+-------+
|prediction|Arrest|  count|
+----------+------+-------+
|       0.0|     0|1674395|
|       1.0|     1|  65633|
|       1.0|     0|    361|
|       0.0|     1| 501561|
+----------+------+-------+



In [46]:
# Compute the confusion matrix
#
# The grouped DataFrame holds at most four rows, so collect it once and map
# the rows locally. Issuing a separate filter(...).first() per cell would run
# four Spark actions, each re-evaluating the uncached predictions and groupBy.
confmat = {'TP': 0.0, 'TN': 0.0, 'FP': 0.0, 'FN': 0.0}

# (prediction, Arrest) -> confusion-matrix cell. Python treats 1, 1.0 and
# Decimal(1) as the same dict key, so this works for any numeric column type.
cell_by_outcome = {
    (1.0, 1): 'TP',
    (0.0, 0): 'TN',
    (1.0, 0): 'FP',
    (0.0, 1): 'FN',
}

for row in df_confusion_matrix.collect():
    cell = cell_by_outcome.get((row['prediction'], row['Arrest']))
    # Combinations outside the four cells (e.g. a null label) are ignored,
    # and cells with no matching row keep their 0.0 default.
    if cell is not None:
        confmat[cell] = float(row['count'])

In [47]:
# Derive accuracy, precision, recall, specificity and F1 score from the
# confusion-matrix counts. Every ratio defaults to 0.0 and is only computed
# when its denominator is positive, so no division by zero can occur.
TP, TN, FP, FN = confmat['TP'], confmat['TN'], confmat['FP'], confmat['FN']

accuracy_rf = 0.0
if (TP + TN + FP + FN) > 0:
    accuracy_rf = (TP + TN) / (TP + TN + FP + FN)

precision_rf = 0.0
if (TP + FP) > 0:
    precision_rf = TP / (TP + FP)

recall_rf = 0.0
if (TP + FN) > 0:
    recall_rf = TP / (TP + FN)

specifity_rf = 0.0
if (TN + FP) > 0:
    specifity_rf = TN / (TN + FP)

# F1 is the harmonic mean of precision and recall
f1score_rf = 0.0
if (precision_rf + recall_rf) > 0:
    f1score_rf = 2 * precision_rf * recall_rf / (precision_rf + recall_rf)

print('Evaluation metrics based on the confusion matrix:')
print(f' Random Forest Accuracy = {accuracy_rf}')
print(f' Random Forest Precision = {precision_rf}')
print(f' Random Forest Recall = {recall_rf}')
print(f' Random Forest Specifity = {specifity_rf}')
print(f' Random Forest F1 score = {f1score_rf}')

Evaluation metrics based on the confusion matrix:
 Random Forest Accuracy = 0.7761225718682397
 Random Forest Precision = 0.9945298057399158
 Random Forest Recall = 0.11571525791880732
 Random Forest Specifity = 0.9997844462118661
 Random Forest F1 score = 0.20730967737859846


Now to compare the 3 different models

Looking at Random Forest's recall, we can see it's very low, so we are excluding it from consideration right away.

Now, comparing logistic regression and linear model:

In [48]:
# Side-by-side comparison of the linear and logistic regression models.
# Each metric pair is printed together, followed by a " " spacer line.
print(f' Linear Accuracy = {accuracy_linear}',
      f' Logistic Regression Accuracy = {accuracy_logreg}',
      " ", sep='\n')
print(f' Linear Precision = {precision_linear}',
      f' Logistic Regression Precision = {precision_logreg}',
      " ", sep='\n')
print(f' Linear Recall = {recall_linear}',
      f' Logistic Regression Recall = {recall_logreg}',
      " ", sep='\n')
print(f' Linear Specifity = {specifity_linear}',
      f' Logistic Regression Specifity = {specifity_logreg}',
      " ", sep='\n')
print(f' Linear F1 score = {f1score_linear}',
      f' Logistic Regression F1 score = {f1score_logreg}',
      " ", sep='\n')
# The ROC pair closes the listing, so no spacer follows it
print(f'Metric areaUnderROC of the linear model = {area_under_ROC_linear}',
      f'Metric areaUnderROC of the Logistic Regression model = {area_under_ROC_logreg}',
      sep='\n')



 Linear Accuracy = 0.8768661210107273
 Logistic Regression Accuracy = 0.882331006489886
 
 Linear Precision = 0.8258252679266906
 Logistic Regression Precision = 0.8725379922787508
 
 Linear Recall = 0.6504811404916131
 Logistic Regression Recall = 0.6263941438026496
 
 Linear Specifity = 0.9535365151699711
 Logistic Regression Specifity = 0.9690098139669301
 
 Linear F1 score = 0.7277402022568982
 Logistic Regression F1 score = 0.7292560823167916
 
Metric areaUnderROC of the linear model = 0.8672023696346972
Metric areaUnderROC of the Logistic Regression model = 0.8984182843426602


As we can see, they are pretty similar, so we'll go forward with the Linear model since it was the one we used in class.