In [1]:
import pyspark
spark = pyspark.sql.SparkSession.builder.appName("crime-classification").getOrCreate()
sc = spark.sparkContext

23/12/13 12:05:19 WARN Utils: Your hostname, Shalvis-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 192.168.4.76 instead (on interface en0)
23/12/13 12:05:19 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/12/13 12:05:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
%matplotlib inline
np.random.seed(60)

In [3]:
#Read the data into spark datafrome
from pyspark.sql.functions import col, lower
df = spark.read.format('csv')\
          .option('header','true')\
          .option('inferSchema', 'true')\
          .option('timestamp', 'true')\
          .load('./sf-crime/train.csv')

data = df.select(lower(col('Category')),lower(col('Descript')))\
        .withColumnRenamed('lower(Category)','Category')\
        .withColumnRenamed('lower(Descript)', 'Description')
data.cache()
print('Dataframe Structure')
print('----------------------------------')
print(data.printSchema())
print(' ')
print('Dataframe preview')
print(data.show(5))
print(' ')
print('----------------------------------')
print('Total number of rows', df.count())

                                                                                

Dataframe Structure
----------------------------------
root
 |-- Category: string (nullable = true)
 |-- Description: string (nullable = true)

None
 
Dataframe preview


                                                                                

+--------------+--------------------+
|      Category|         Description|
+--------------+--------------------+
|      warrants|      warrant arrest|
|other offenses|traffic violation...|
|other offenses|traffic violation...|
| larceny/theft|grand theft from ...|
| larceny/theft|grand theft from ...|
+--------------+--------------------+
only showing top 5 rows

None
 
----------------------------------


[Stage 4:>                                                        (0 + 10) / 10]

Total number of rows 878049


                                                                                

In [4]:
def top_n_list(df,var, N):
    '''
    This function determine the top N numbers of the list
    '''
    print("Total number of unique value of"+' '+var+''+':'+' '+str(df.select(var).distinct().count()))
    print(' ')
    print('Top'+' '+str(N)+' '+'Crime'+' '+var)
    df.groupBy(var).count().withColumnRenamed('count','totalValue')\
    .orderBy(col('totalValue').desc()).show(N)
    
    
top_n_list(data, 'Category',10)
print(' ')
print(' ')
top_n_list(data,'Description',10)

                                                                                

Total number of unique value of Category: 39
 
Top 10 Crime Category


                                                                                

+--------------+----------+
|      Category|totalValue|
+--------------+----------+
| larceny/theft|    174900|
|other offenses|    126182|
|  non-criminal|     92304|
|       assault|     76876|
| drug/narcotic|     53971|
| vehicle theft|     53781|
|     vandalism|     44725|
|      warrants|     42214|
|      burglary|     36755|
|suspicious occ|     31414|
+--------------+----------+
only showing top 10 rows

 
 


                                                                                

Total number of unique value of Description: 879
 
Top 10 Crime Description
+--------------------+----------+
|         Description|totalValue|
+--------------------+----------+
|grand theft from ...|     60022|
|       lost property|     31729|
|             battery|     27441|
|   stolen automobile|     26897|
|drivers license, ...|     26839|
|      warrant arrest|     23754|
|suspicious occurr...|     21891|
|aided case, menta...|     21497|
|petty theft from ...|     19771|
|malicious mischie...|     17789|
+--------------------+----------+
only showing top 10 rows



In [5]:
data.select('Category').distinct().count()

39

In [6]:
data_train, data_test = data.randomSplit([0.7,0.3], seed=60)
print("Training Dataset Count:", data_train.count())
print("Test Dataset Count:", data_test.count())

                                                                                

Training Dataset Count: 614020
Test Dataset Count: 264029


In [7]:
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer, OneHotEncoder, StringIndexer, VectorAssembler, HashingTF, IDF, Word2Vec
from pyspark.ml import Pipeline
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator 
from pyspark.ml.classification import LogisticRegression, NaiveBayes 

#----------------Define tokenizer with regextokenizer()------------------
regex_tokenizer = RegexTokenizer(pattern='\\W')\
                  .setInputCol("Description")\
                  .setOutputCol("tokens")

#----------------Define stopwords with stopwordsremover()---------------------
extra_stopwords = ['http','amp','rt','t','c','the']
stopwords_remover = StopWordsRemover()\
                    .setInputCol('tokens')\
                    .setOutputCol('filtered_words')\
                    .setStopWords(extra_stopwords)
                    

#----------Define bags of words using countVectorizer()---------------------------
count_vectors = CountVectorizer()\
               .setInputCol("filtered_words")\
               .setOutputCol("features")



#-----------Encode the Category variable into label using StringIndexer-----------
label_string_idx = StringIndexer()\
                  .setInputCol("Category")\
                  .setOutputCol("label")

#-----------Define classifier structure for logistic Regression--------------
lr = LogisticRegression()



def metrics_ev(labels, metrics):
    '''
    List of all performance metrics
    '''
    # Confusion matrix
    print("---------Confusion matrix-----------------")
    print(metrics.confusionMatrix)
    print(' ')    
    # Overall statistics
    print('----------Overall statistics-----------')
    print("Precision = %s" %  metrics.precision())
    print("Recall = %s" %  metrics.recall())
    print("F1 Score = %s" % metrics.fMeasure())
    print(' ')
    # Statistics by class
    print('----------Statistics by class----------')
    for label in sorted(labels):
       print("Class %s precision = %s" % (label, metrics.precision(label)))
       print("Class %s recall = %s" % (label, metrics.recall(label)))
       print("Class %s F1 Measure = %s" % (label, metrics.fMeasure(label, beta=1.0)))
    print(' ')
    # Weighted stats
    print('----------Weighted stats----------------')
    print("Weighted recall = %s" % metrics.weightedRecall)
    print("Weighted precision = %s" % metrics.weightedPrecision)
    print("Weighted F(1) Score = %s" % metrics.weightedFMeasure())
    print("Weighted F(0.5) Score = %s" % metrics.weightedFMeasure(beta=0.5))
    print("Weighted false positive rate = %s" % metrics.weightedFalsePositiveRate)
    

In [None]:
start_time = time.time()
pipeline_cv_lr = Pipeline().setStages([regex_tokenizer,stopwords_remover,count_vectors,label_string_idx, lr])
model_cv_lr = pipeline_cv_lr.fit(data_train)
predictions_cv_lr = model_cv_lr.transform(data_test)
end_time = time.time()
elapsed_time = end_time - start_time()
print(elapsed_time)

In [11]:
lr_pipeline = Pipeline().setStages([regex_tokenizer, stopwords_remover, count_vectors, label_string_idx, lr])

param_grid = ParamGridBuilder() \
    .addGrid(count_vectors.vocabSize, [5000, 10000, 15000]) \
    .addGrid(count_vectors.minDF, [3, 5, 7]) \
    .addGrid(lr.maxIter, [10, 20, 30]) \
    .addGrid(lr.regParam, [0.1, 0.3, 0.5]) \
    .build()

crossval = CrossValidator(estimator=lr_pipeline,
                          estimatorParamMaps=param_grid,
                          evaluator=MulticlassClassificationEvaluator(),
                          numFolds=5)  
fold_times = []

for i in range(5):  
    print(f"\nTraining Fold {i + 1}")

    start_time = time.time()

    cv_model = crossval.fit(df)  

    end_time = time.time()

    elapsed_time = end_time - start_time
    fold_times.append(elapsed_time)

    print(f"Fold {i + 1} - Best Parameters: {cv_model.bestModel.stages[0].extractParamMap()}")
    print(f"Fold {i + 1} - Elapsed Time: {elapsed_time:.2f} seconds")
$
average_time = sum(fold_times) / len(fold_times)
print(f"\nAverage Time Across Folds: {average_time:.2f} seconds")



Training Fold 1


23/12/12 16:51:59 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
                                                                                

Fold 1 - Best Parameters: {Param(parent='RegexTokenizer_6728291e2d16', name='gaps', doc='whether regex splits on gaps (True) or matches tokens (False)'): True, Param(parent='RegexTokenizer_6728291e2d16', name='minTokenLength', doc='minimum token length (>= 0)'): 1, Param(parent='RegexTokenizer_6728291e2d16', name='outputCol', doc='output column name.'): 'tokens', Param(parent='RegexTokenizer_6728291e2d16', name='pattern', doc='regex pattern (Java dialect) used for tokenizing'): '\\W', Param(parent='RegexTokenizer_6728291e2d16', name='toLowercase', doc='whether to convert all characters to lowercase before tokenizing'): True, Param(parent='RegexTokenizer_6728291e2d16', name='inputCol', doc='input column name.'): 'Description'}
Fold 1 - Elapsed Time: 712.86 seconds

Training Fold 2


Exception ignored in: <function JavaWrapper.__del__ at 0x13fb7b740>             
Traceback (most recent call last):
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/pyspark/ml/wrapper.py", line 53, in __del__
    if SparkContext._active_spark_context and self._java_obj is not None:
                                              ^^^^^^^^^^^^^^
AttributeError: 'CountVectorizer' object has no attribute '_java_obj'
                                                                                

Fold 2 - Best Parameters: {Param(parent='RegexTokenizer_6728291e2d16', name='gaps', doc='whether regex splits on gaps (True) or matches tokens (False)'): True, Param(parent='RegexTokenizer_6728291e2d16', name='minTokenLength', doc='minimum token length (>= 0)'): 1, Param(parent='RegexTokenizer_6728291e2d16', name='outputCol', doc='output column name.'): 'tokens', Param(parent='RegexTokenizer_6728291e2d16', name='pattern', doc='regex pattern (Java dialect) used for tokenizing'): '\\W', Param(parent='RegexTokenizer_6728291e2d16', name='toLowercase', doc='whether to convert all characters to lowercase before tokenizing'): True, Param(parent='RegexTokenizer_6728291e2d16', name='inputCol', doc='input column name.'): 'Description'}
Fold 2 - Elapsed Time: 679.96 seconds

Training Fold 3


                                                                                

Fold 3 - Best Parameters: {Param(parent='RegexTokenizer_6728291e2d16', name='gaps', doc='whether regex splits on gaps (True) or matches tokens (False)'): True, Param(parent='RegexTokenizer_6728291e2d16', name='minTokenLength', doc='minimum token length (>= 0)'): 1, Param(parent='RegexTokenizer_6728291e2d16', name='outputCol', doc='output column name.'): 'tokens', Param(parent='RegexTokenizer_6728291e2d16', name='pattern', doc='regex pattern (Java dialect) used for tokenizing'): '\\W', Param(parent='RegexTokenizer_6728291e2d16', name='toLowercase', doc='whether to convert all characters to lowercase before tokenizing'): True, Param(parent='RegexTokenizer_6728291e2d16', name='inputCol', doc='input column name.'): 'Description'}
Fold 3 - Elapsed Time: 704.53 seconds

Training Fold 4


                                                                                

Fold 4 - Best Parameters: {Param(parent='RegexTokenizer_6728291e2d16', name='gaps', doc='whether regex splits on gaps (True) or matches tokens (False)'): True, Param(parent='RegexTokenizer_6728291e2d16', name='minTokenLength', doc='minimum token length (>= 0)'): 1, Param(parent='RegexTokenizer_6728291e2d16', name='outputCol', doc='output column name.'): 'tokens', Param(parent='RegexTokenizer_6728291e2d16', name='pattern', doc='regex pattern (Java dialect) used for tokenizing'): '\\W', Param(parent='RegexTokenizer_6728291e2d16', name='toLowercase', doc='whether to convert all characters to lowercase before tokenizing'): True, Param(parent='RegexTokenizer_6728291e2d16', name='inputCol', doc='input column name.'): 'Description'}
Fold 4 - Elapsed Time: 708.91 seconds

Training Fold 5


                                                                                

Fold 5 - Best Parameters: {Param(parent='RegexTokenizer_6728291e2d16', name='gaps', doc='whether regex splits on gaps (True) or matches tokens (False)'): True, Param(parent='RegexTokenizer_6728291e2d16', name='minTokenLength', doc='minimum token length (>= 0)'): 1, Param(parent='RegexTokenizer_6728291e2d16', name='outputCol', doc='output column name.'): 'tokens', Param(parent='RegexTokenizer_6728291e2d16', name='pattern', doc='regex pattern (Java dialect) used for tokenizing'): '\\W', Param(parent='RegexTokenizer_6728291e2d16', name='toLowercase', doc='whether to convert all characters to lowercase before tokenizing'): True, Param(parent='RegexTokenizer_6728291e2d16', name='inputCol', doc='input column name.'): 'Description'}
Fold 5 - Elapsed Time: 668.36 seconds

Average Time Across Folds: 694.92 seconds


In [21]:
train_results = cv_model.bestModel.transform(data_train)
test_results = cv_model.bestModel.transform(data_test)

Train predictions

In [22]:
print('-----------------------------Check Top 5 predictions----------------------------------')
print(' ')
train_results.select('Description','Category',"probability","label","prediction")\
                                        .orderBy("probability", ascending=False)\
                                        .show(n=5, truncate=30)

-----------------------------Check Top 5 predictions----------------------------------
 


[Stage 6705:=====>                                                 (1 + 9) / 10]

+------------------------------+-------------+------------------------------+-----+----------+
|                   Description|     Category|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
+------------------------------+-------------+------------------------------+-----+----------+
only showing top 5 rows



                                                                                

In [23]:
print('-----------------------------Check Top 5 predictions----------------------------------')
print(' ')
test_results.select('Description','Category',"probability","label","prediction")\
                                        .orderBy("probability", ascending=False)\
                                        .show(n=5, truncate=30)

-----------------------------Check Top 5 predictions----------------------------------
 


[Stage 6706:>                                                     (0 + 10) / 10]

+------------------------------+-------------+------------------------------+-----+----------+
|                   Description|     Category|                   probability|label|prediction|
+------------------------------+-------------+------------------------------+-----+----------+
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
|theft, bicycle, <$50, no se...|larceny/theft|[0.8091214496699353,0.02988...|  0.0|       0.0|
+------------------------------+-------------+------------------------------+-----+----------+
only showing top 5 rows



                                                                                

Training Results

In [24]:
evaluator_cv_lr = MulticlassClassificationEvaluator().setPredictionCol("prediction").evaluate(train_results)
print(' ')
print('------------------------------Accuracy----------------------------------')
print(' ')
print('                       accuracy:{}:'.format(evaluator_cv_lr))

[Stage 6707:=====>                                                 (1 + 9) / 10]

 
------------------------------Accuracy----------------------------------
 
                       accuracy:0.9344596882969053:


                                                                                

Test Results

In [25]:
evaluator_cv = MulticlassClassificationEvaluator().setPredictionCol("prediction").evaluate(test_results)
print(' ')
print('------------------------------Accuracy----------------------------------')
print(' ')
print('                       accuracy:{}:'.format(evaluator_cv))

[Stage 6709:=====>                                                 (1 + 9) / 10]

 
------------------------------Accuracy----------------------------------
 
                       accuracy:0.9343702864245909:


                                                                                