# IST718 Big Data Final Project - SF_crime
##### Project1 - Chiau Yin Yang, Qing Chen, Zilong Chen

We ran our models on Databricks and downloaded the Python notebook for submission.

In [2]:
# import package
from csv import reader
from pyspark.sql import SparkSession
import pyspark

# Reuse the active Spark session if there is one, otherwise start a new one.
spark = SparkSession.builder.getOrCreate()
# Handle to the SparkContext that backs the session.
sc = spark.sparkContext

from pyspark.ml import feature, regression, Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler
from pyspark.sql import functions as fn, Row
from pyspark.sql.types import *
from pyspark import sql
from pyspark.ml.classification import LogisticRegression, RandomForestClassifier
from pyspark.ml.classification import GBTClassifier

import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
import warnings

import os

## Convert Data to Spark DataFrame

In [4]:
# Load dataset
from pyspark.sql import SparkSession

# Configure the session builder step by step, then obtain the session.
session_builder = SparkSession.builder.appName("crime analysis")
session_builder = session_builder.config("spark.some.config.option", "some-value")
spark = session_builder.getOrCreate()

# Read the incident reports; the first CSV line supplies the column names.
incidents_csv_path = "FileStore/tables/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv"
df = spark.read.format("csv").options(header="true").load(incidents_csv_path)


In [5]:
# List the upload folder to confirm the dataset file names and sizes
%fs ls FileStore/tables

path,name,size
dbfs:/FileStore/tables/Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv,Police_Department_Incident_Reports__Historical_2003_to_May_2018.csv,300601428
dbfs:/FileStore/tables/crime_type_predefined-1fa1e.csv,crime_type_predefined-1fa1e.csv,615


In [6]:
# Peek at the first five rows, then print how many rows were loaded.
df.show(n=5)
print(df.count())

In [7]:
# Data cleaning: discard every row that has a null in any column.
df1 = df.dropna()
df1.count()

In [8]:
from pyspark.sql.types import DateType
from pyspark.sql.functions import year, month, dayofmonth
from pyspark.sql.functions import udf
from datetime import datetime
# Parse the 'Date' string (month/day/year) into a Spark DateType column.
# .date() makes the UDF return a plain date, matching the declared DateType.
func = udf(lambda x: datetime.strptime(x, '%m/%d/%Y').date(), DateType())
# Take the column from df1 -- the NA-free frame that is actually being
# extended -- rather than from the raw df it was derived from.
df_with_timestamp = df1.withColumn('Spark_Date', func(df1.Date))
# Add 'Year', 'Month', 'Day' columns extracted from the parsed date.
df_with_timestamp = df_with_timestamp.withColumn('Year', year(df_with_timestamp.Spark_Date)).withColumn('Month', month(df_with_timestamp.Spark_Date))
df_with_timestamp = df_with_timestamp.withColumn('Day', dayofmonth(df_with_timestamp.Spark_Date))


In [9]:
# Check the new Spark_Date / Year / Month / Day columns on a small sample.
df_with_timestamp.show(n=5)

### Quick analysis for crime incidents by time 
- rank of crime by year

In [11]:
# Incident totals per year, busiest year first; cached for reuse in later cells.
incidents_per_year = df_with_timestamp.groupBy('Year').count()
df_crimes_2003_to_2018_yearly = incidents_per_year.orderBy(fn.desc('count'))
df_crimes_2003_to_2018_yearly.cache()

In [12]:
# Show the ten years with the most incidents.
df_crimes_2003_to_2018_yearly.show(n=10)

In [13]:
# Incidents per calendar month, ranked from most to fewest.
Crime_by_month = (
    df_with_timestamp
    .groupBy('Month')
    .count()
    .orderBy(fn.desc('count'))
)
Crime_by_month.show()

In [14]:
# The same monthly totals, listed in calendar order.
Crime_by_month2 = (
    df_with_timestamp
    .groupBy('Month')
    .count()
    .orderBy('Month')
)
Crime_by_month2.show()

In [15]:
# Convert the Time column from string to timestamp and extract the hour and minute
from pyspark.sql.types import TimestampType
from pyspark.sql.functions import year, month, dayofmonth, hour, minute
from datetime import datetime
# Parse the 'Time' string (HH:MM) into a timestamp so its parts can be extracted.
func_timestamp = udf(lambda raw_time: datetime.strptime(raw_time, '%H:%M'), TimestampType())
df_with_timestamp = df_with_timestamp.withColumn('Spark_Time', func_timestamp(df_with_timestamp.Time))
# Add 'Hour' and 'Minute' columns taken from the parsed time.
parsed_time = df_with_timestamp.Spark_Time
df_with_timestamp = df_with_timestamp.withColumn('Hour', hour(parsed_time))
df_with_timestamp = df_with_timestamp.withColumn('Minute', minute(parsed_time))

In [16]:
# Spot-check the derived date/time parts next to the incident id and category.
timestamp_preview_cols = ['IncidntNum', 'Category', 'Year', 'Month', 'Day', 'Hour', 'Minute']
show_timestamp = df_with_timestamp.select(*timestamp_preview_cols)
show_timestamp.show(n=5)

In [17]:
# Preview the columns that feed the model: category, district, time parts and outcome.
model_preview_cols = ['Category', 'PdDistrict', 'Year', 'Month', 'Day', 'Hour', 'Resolution']
show_timestamp2 = df_with_timestamp.select(*model_preview_cols)
show_timestamp2.show(n=10)

In [18]:
# Convert the target attribute 'Resolution' to a binary label:
# 0 when it is 'NONE' or 'UNFOUNDED', 1 for every other value.
no_resolution = fn.col('Resolution').isin('NONE', 'UNFOUNDED')
df_with_timestamp = df_with_timestamp.withColumn('Res_num', fn.when(no_resolution, 0).otherwise(1))

In [19]:
# Class balance of the binary target.
df_with_timestamp.groupBy('Res_num').count().show()

## Feature Engineering
##### Create labeled dummy variables for categorical columns
- Location (PD District)
- Day of week
- Crime category (broken into 5 levels by the degree of the crime, defined by us)
- Incident hour (broken into 5 levels to even out the number of incidents)
- Month as 12 levels

In [20]:
# Location: one 0/1 indicator column per police district.
# RICHMOND gets no column so that it serves as the reference level for inference.
district_names = ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
                  'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
for district in district_names:
    in_district = fn.col('PdDistrict') == district
    df_with_timestamp = df_with_timestamp.withColumn(district, in_district.cast('int'))

In [21]:
# Check the district indicator columns on a small sample.
df_with_timestamp.show(n=5)

In [22]:
# Load the pre-defined crime level dataset; the header row supplies the column names.
crime_type_reader = spark.read.format("csv").options(header="true")
df_crime_type = crime_type_reader.load("FileStore/tables/crime_type_predefined-1fa1e.csv")

In [23]:
# Inspect the first three rows of the crime-level lookup.
df_crime_type.head(n=3)

In [24]:
import pyspark.sql
from pyspark.sql import functions as fn


In [25]:
#df_with_timestamp.printSchema()

In [26]:
# Keep only the columns needed downstream to reduce the size of the frame.
kept_columns = ['Category', 'Year', 'Month', 'Hour', 'Res_num']
kept_columns += ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
                 'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
kept_columns += ['DayOfWeek']
df_final = df_with_timestamp.select(*kept_columns)
df_final.show(n=5)

In [27]:
# Day-of-week indicators; Wednesday gets no column and acts as the reference level.
for weekday in ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']:
    on_weekday = fn.col('DayOfWeek') == weekday
    df_final = df_final.withColumn(weekday, fn.when(on_weekday, 1).otherwise(0))

df_final.show(n=3)

In [28]:
# Morning covers hours 1 through 8 and is meant to be the reference level.
morning_hours = list(range(1, 9))
in_morning = fn.col('Hour').isin(morning_hours)
df_final = df_final.withColumn('Morning', fn.when(in_morning, 1).otherwise(0))


In [29]:
# Time-of-day indicators derived from 'Hour' (Morning is built separately as the reference).
incident_hour = fn.col('Hour')
df_final = df_final.withColumn('Near noon', fn.when(incident_hour.isin(9, 10, 11, 12), 1).otherwise(0))
df_final = df_final.withColumn('Afternoon', fn.when(incident_hour.isin(13, 14, 15, 16, 17), 1).otherwise(0))
df_final = df_final.withColumn('evening', fn.when(incident_hour.between(18, 20), 1).otherwise(0))
# Night wraps around midnight: 21:00-23:59 plus the 00:00 hour.
df_final = df_final.withColumn('Night', fn.when(incident_hour.isin(21, 22, 23, 0), 1).otherwise(0))

df_final.show(n=3)
#training_df = df_final.where('Year >= 2008 and Year < 2014')                       

In [30]:
# Morning is the baseline, so its indicator column is removed before modeling.
df_final = df_final.drop('Morning')
df_final.show(n=2)

In [31]:
# Remove the raw columns that have already been encoded as indicators.
df_final = df_final.drop('Hour', 'DayOfWeek')
df_final.show(n=2)

In [32]:
# Dimensions of the modeling frame as (rows, columns).
frame_shape = (df_final.count(), len(df_final.columns))
print(frame_shape)

In [33]:
# Rename the lookup columns so they line up with the incident frame.
df_crime_type = df_crime_type.select(
    fn.col('crime_type').alias('Category'),
    fn.col('level').alias('Level'),
)
df_crime_type.show(n=2)

In [34]:

# Attach the severity level to each incident. The left join keeps incidents
# whose category has no match in the lookup; their Level is null.
crime_join = df_final.join(df_crime_type, on=['Category'], how='left_outer')
crime_join.show(n=3)


In [35]:
# Severity indicators from 'Level'; level '1' (very minor) gets no column
# and acts as the reference level.
severity_by_level = [('2', 'Minor'), ('3', 'Medium'), ('4', 'Servere'), ('5', 'Very Servere')]
for level_code, severity_name in severity_by_level:
    crime_join = crime_join.withColumn(
        severity_name, fn.when(fn.col('Level') == level_code, 1).otherwise(0))

crime_join.show(n=3)

In [36]:
# Incidents per severity level; a null group here would mean unmatched categories.
crime_join.groupBy('Level').count().show()

In [37]:
# The severity indicators replace the raw level and category columns.
df_final = crime_join.drop('Level', 'Category')
df_final.show()

In [38]:
# Month indicators; December gets no column and is the reference level.
month_abbrevs = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
for month_no, month_abbrev in enumerate(month_abbrevs, start=1):
    df_final = df_final.withColumn(
        month_abbrev, fn.when(fn.col('Month') == str(month_no), 1).otherwise(0))
df_final.show(n=3)

In [39]:
# Year indicators; 2008 gets no column and is the reference level.
# Every indicator is derived from 'Year' itself (Year_09 ... Year_17).
# 'Month' is deliberately left untouched: it holds the numeric month and is
# still passed to some models as an input column further down.
for dummy_year in range(2009, 2018):
    year_dummy_name = 'Year_' + str(dummy_year)[-2:]
    df_final = df_final.withColumn(
        year_dummy_name, fn.when(fn.col('Year') == dummy_year, 1).otherwise(0))
df_final.show(3)

## Split dataset into training, validation and testing by the year of the incidents
- Follow 0.6, 0.3, and 0.1 rule to split (6 years of data as training etc)

In [40]:
# Chronological split: 2008-2013 for training, 2014-2016 for validation,
# 2017 onward for testing.
incident_year = fn.col('Year')
training_df = df_final.where((incident_year >= 2008) & (incident_year < 2014))
validation_df = df_final.where((incident_year >= 2014) & (incident_year < 2017))
testing_df = df_final.where(incident_year >= 2017)

In [41]:
# Report the number of rows in each split.
for split_label, split_df in [("# points in training: ", training_df),
                              ("# points in validation: ", validation_df),
                              ("# points in testing: ", testing_df)]:
    print(split_label, split_df.count())

In [42]:
# Class balance of the target within the test split.
# Split sizes recorded from the run: training 841044, validation 457536, testing 154773.
testing_df.groupBy('Res_num').count().show()

# Modeling - run with different variables
- Random Forest (referred to as rf)
- Logistic regression (referred to as lr)
- Gradient boosting (referred to as gb)

The number in each model name (`model#`) identifies the set of variables that model uses.

#### Variables that are considered:
- location
- Crime category
- Incident time
    - Hour (5 levels)
    - Month (12 levels)
    - Day of the week (7 levels)
    
Parameters used in model if any:
1. (RF) number of trees (10, 20, 30, 50, 80, 100)
2. (RF) cacheNodeIds
3. (GB, RF) seed = 0 for same result
4. (GB, LR) maxIter=10
5. (LR) regParam, elasticNetParam

In [43]:
# Functionality for computing features
from pyspark.ml import feature
# Functionality for regression
from pyspark.ml import regression
# Funcionality for classification
from pyspark.ml import classification
# Object for creating sequences of transformations
from pyspark.ml import Pipeline, evaluation

In [44]:
from pyspark.mllib.classification import LogisticRegressionWithLBFGS
from pyspark.mllib.util import MLUtils
from pyspark.mllib.evaluation import MulticlassMetrics

In [45]:
from pyspark.sql.functions import isnan
from pyspark.ml.feature import OneHotEncoderEstimator
from pyspark.ml.feature import StringIndexer

In [46]:
# Logistic regression, model 1: crime severity level only.
lr_m1_inputs = ['Minor', 'Medium', 'Servere', 'Very Servere']
lr_m1_assembler = feature.VectorAssembler(inputCols=lr_m1_inputs, outputCol='features')
# regParam and elasticNetParam are not set, i.e. left at their defaults (0).
lr_m1_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10)
crime_m1 = Pipeline(stages=[lr_m1_assembler, lr_m1_classifier])
crime_m1_fitted = crime_m1.fit(training_df)
transformer_m1 = crime_m1_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
lr_AUC1 = evaluator.evaluate(transformer_m1)
print('lr_m1 = ',lr_AUC1)

In [47]:
# Gradient-boosted trees, model 1: crime severity level only.
gb_m1_inputs = ['Minor', 'Medium', 'Servere', 'Very Servere']
gb_m1 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m1_inputs, outputCol='features'),
    # seed=0 pins the run, as the notebook's parameter list states for GB and
    # as the random-forest cells already do.
    classification.GBTClassifier(labelCol='Res_num', featuresCol='features',
                                 maxIter=10, seed=0)])
gb_m1_fitted = gb_m1.fit(training_df)
gb_transformer_m1 = gb_m1_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
gb_AUC1 = evaluator.evaluate(gb_transformer_m1)
print('gb_m1 = ',gb_AUC1)

In [48]:
# Random forest, model 1: crime severity level only (10 trees, fixed seed).
rf_m1_inputs = ['Minor', 'Medium', 'Servere', 'Very Servere']
rf_m1_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=10, seed=0, cacheNodeIds=True)
rf_m1 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m1_inputs, outputCol='features'),
    rf_m1_forest,
])
rf_m1_fitted = rf_m1.fit(training_df)
rf_m1_trans = rf_m1_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC1 = evaluator.evaluate(rf_m1_trans)
print('rf_m1 = ',rf_AUC1)

In [49]:
# Logistic regression, model 2: crime severity level + police district.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

lr_m2_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
lr_m2_assembler = feature.VectorAssembler(inputCols=lr_m2_inputs, outputCol='features')
lr_m2_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10)
crime_m2 = Pipeline(stages=[lr_m2_assembler, lr_m2_classifier])
crime_m2_fitted = crime_m2.fit(training_df)
transformer_m2 = crime_m2_fitted.transform(validation_df)

# Score the validation predictions; the result is reported as the AUC.
evaluator = BinaryClassificationEvaluator(labelCol="Res_num")
print('lr_AUC2 = ', evaluator.evaluate(transformer_m2))

In [50]:
# Gradient-boosted trees, model 2: crime severity level + police district.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

gb_m2_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
gb_m2 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m2_inputs, outputCol='features'),
    # seed=0 pins the run, as the notebook's parameter list states for GB and
    # as the random-forest cells already do.
    classification.GBTClassifier(labelCol='Res_num', featuresCol='features',
                                 maxIter=10, seed=0)])
gb_m2_fitted = gb_m2.fit(training_df)
gb_transformer_m2 = gb_m2_fitted.transform(validation_df)

# Score the validation predictions; the result is reported as the AUC.
evaluator = BinaryClassificationEvaluator(labelCol="Res_num")
print('gb_AUC2 = ', evaluator.evaluate(gb_transformer_m2))

In [51]:
# Random forest, model 2: crime severity level + police district (10 trees, fixed seed).
rf_m2_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
rf_m2_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=10, seed=0, cacheNodeIds=True)
rf_m2 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m2_inputs, outputCol='features'),
    rf_m2_forest,
])
rf_m2_fitted = rf_m2.fit(training_df)
rf_m2_trans = rf_m2_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC2 = evaluator.evaluate(rf_m2_trans)
print('rf_m2 = ',rf_AUC2)

In [52]:
# Logistic regression, model 9: severity + month + district + time of day.
# Year and day of the week are not part of this feature set.
lr_m9_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
lr_m9_assembler = feature.VectorAssembler(inputCols=lr_m9_inputs, outputCol='features')
# No maxIter is passed here, so the estimator's default applies.
lr_m9_classifier = classification.LogisticRegression(featuresCol='features', labelCol='Res_num')
crime_m9 = Pipeline(stages=[lr_m9_assembler, lr_m9_classifier])
crime_m9_fitted = crime_m9.fit(training_df)
transformer_m9 = crime_m9_fitted.transform(validation_df)

# Uses the `evaluator` created in an earlier cell.
print('AUC9 = ', evaluator.evaluate(transformer_m9))

In [53]:

# Gradient-boosted trees, model 9: severity + month + district + time of day.
# Year and day of the week are not part of this feature set.
gb_m9_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
gb_m9 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m9_inputs, outputCol='features'),
    # seed=0 pins the run, as the notebook's parameter list states for GB and
    # as the random-forest cells already do.
    classification.GBTClassifier(labelCol='Res_num', featuresCol='features',
                                 maxIter=10, seed=0)])
gb_m9_fitted = gb_m9.fit(training_df)
gb_transformer_m9 = gb_m9_fitted.transform(validation_df)

# Uses the `evaluator` created in an earlier cell. The label carries the gb_
# prefix so this line cannot be confused with the logistic-regression AUC9 output.
print('gb_AUC9 = ', evaluator.evaluate(gb_transformer_m9))

In [54]:
# Random forest, model 9: severity + month + district + time of day (10 trees, fixed seed).
rf_m9_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
rf_m9_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=10, seed=0, cacheNodeIds=True)
rf_m9 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m9_inputs, outputCol='features'),
    rf_m9_forest,
])
rf_m9_fitted = rf_m9.fit(training_df)
rf_m9_trans = rf_m9_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC9 = evaluator.evaluate(rf_m9_trans)
print('rf_m9 = ',rf_AUC9)

In [55]:
# Logistic regression, model 3: severity + month + day of week + time of day.
lr_m3_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
lr_m3_assembler = feature.VectorAssembler(inputCols=lr_m3_inputs, outputCol='features')
lr_m3_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10)
crime_m3 = Pipeline(stages=[lr_m3_assembler, lr_m3_classifier])
crime_m3_fitted = crime_m3.fit(training_df)
transformer_m3 = crime_m3_fitted.transform(validation_df)
# Uses the `evaluator` created in an earlier cell.
print('lr_AUC3 = ', evaluator.evaluate(transformer_m3))

In [56]:
# Gradient-boosted trees, model 3: severity + year + month + day of week + time of day.
# NOTE(review): unlike the lr/rf model-3 cells, this model takes the 'Year' and
# 'Month' columns directly instead of the month dummies -- confirm this is intended.
gb_m3_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Year', 'Month']                                                              # date columns
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
gb_m3 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m3_inputs, outputCol='features'),
    # seed=0 pins the run, as the notebook's parameter list states for GB and
    # as the random-forest cells already do.
    classification.GBTClassifier(labelCol='Res_num', featuresCol='features',
                                 maxIter=10, seed=0)])
gb_m3_fitted = gb_m3.fit(training_df)
gb_transformer_m3 = gb_m3_fitted.transform(validation_df)
# Uses the `evaluator` created in an earlier cell. The label says gb_, not lr_:
# this is the gradient-boosting result.
print('gb_AUC3 = ', evaluator.evaluate(gb_transformer_m3))

In [57]:
# Random forest, model 3: severity + month + day of week + time of day (10 trees, fixed seed).
rf_m3_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
rf_m3_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=10, seed=0, cacheNodeIds=True)
rf_m3 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m3_inputs, outputCol='features'),
    rf_m3_forest,
])
rf_m3_fitted = rf_m3.fit(training_df)
rf_m3_trans = rf_m3_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC3 = evaluator.evaluate(rf_m3_trans)
print('rf_m3 = ',rf_AUC3)

In [58]:
# Logistic regression, model 4 (all variables):
# severity + month + day of week + time of day + district.
lr_m4_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
)
lr_m4_assembler = feature.VectorAssembler(inputCols=lr_m4_inputs, outputCol='features')
lr_m4_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10)
crime_m4 = Pipeline(stages=[lr_m4_assembler, lr_m4_classifier])
crime_m4_fitted = crime_m4.fit(training_df)
transformer_m4 = crime_m4_fitted.transform(validation_df)
# Uses the `evaluator` created in an earlier cell.
print('lr_AUC4 = ', evaluator.evaluate(transformer_m4))

In [59]:

# Gradient-boosted trees, model 4 (all variables):
# severity + month + day of week + time of day + district.
gb_m4_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
)
gb_m4 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m4_inputs, outputCol='features'),
    # seed=0 pins the run, as the notebook's parameter list states for GB and
    # as the random-forest cells already do.
    classification.GBTClassifier(labelCol='Res_num', featuresCol='features',
                                 maxIter=10, seed=0)])
gb_m4_fitted = gb_m4.fit(training_df)
gb_transformer_m4 = gb_m4_fitted.transform(validation_df)
# Uses the `evaluator` created in an earlier cell. The label says gb_, not lr_:
# this is the gradient-boosting result.
print('gb_AUC4 = ', evaluator.evaluate(gb_transformer_m4))

In [60]:
# Random forest, model 4 (all variables):
# severity + month + day of week + time of day + district (10 trees, fixed seed).
rf_m4_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
)
rf_m4_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=10, seed=0, cacheNodeIds=True)
rf_m4 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m4_inputs, outputCol='features'),
    rf_m4_forest,
])
rf_m4_fitted = rf_m4.fit(training_df)
rf_m4_trans = rf_m4_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC4 = evaluator.evaluate(rf_m4_trans)
print('rf_m4 = ',rf_AUC4)

In [61]:
# Random forest on the model-2 inputs (severity + district) with 50 trees.
rf_m21_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
rf_m21_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=50, seed=0, cacheNodeIds=True)
rf_m21 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m21_inputs, outputCol='features'),
    rf_m21_forest,
])
rf_m21_fitted = rf_m21.fit(training_df)
rf_m21_trans = rf_m21_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC21 = evaluator.evaluate(rf_m21_trans)
print('rf_m21 = ',rf_AUC21)

In [62]:
# Random forest on the model-2 inputs (severity + district) with 30 trees.
rf_m22_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
rf_m22_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=30, seed=0, cacheNodeIds=True)
rf_m22 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m22_inputs, outputCol='features'),
    rf_m22_forest,
])
rf_m22_fitted = rf_m22.fit(training_df)
rf_m22_trans = rf_m22_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC22 = evaluator.evaluate(rf_m22_trans)
print('rf_m22 = ',rf_AUC22)

In [63]:
# Random forest on the model-2 inputs (severity + district) with 40 trees.
rf_m23_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
rf_m23_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=40, seed=0, cacheNodeIds=True)
rf_m23 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m23_inputs, outputCol='features'),
    rf_m23_forest,
])
rf_m23_fitted = rf_m23.fit(training_df)
rf_m23_trans = rf_m23_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC23 = evaluator.evaluate(rf_m23_trans)
print('rf_m23 = ',rf_AUC23)

In [64]:
# Random forest on the model-2 inputs (severity + district) with 80 trees.
rf_m24_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
rf_m24_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=80, seed=0, cacheNodeIds=True)
rf_m24 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m24_inputs, outputCol='features'),
    rf_m24_forest,
])
rf_m24_fitted = rf_m24.fit(training_df)
rf_m24_trans = rf_m24_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC24 = evaluator.evaluate(rf_m24_trans)
print('rf_m24 = ',rf_AUC24)

In [65]:
# Random forest on the model-2 inputs (severity + district) with 100 trees.
rf_m25_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
)
rf_m25_forest = classification.RandomForestClassifier(
    featuresCol='features', labelCol='Res_num',
    numTrees=100, seed=0, cacheNodeIds=True)
rf_m25 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=rf_m25_inputs, outputCol='features'),
    rf_m25_forest,
])
rf_m25_fitted = rf_m25.fit(training_df)
rf_m25_trans = rf_m25_fitted.transform(validation_df)
# Score the validation predictions; the result is reported as the AUC.
evaluator = evaluation.BinaryClassificationEvaluator(labelCol='Res_num')
rf_AUC25 = evaluator.evaluate(rf_m25_trans)
print('rf_m25 = ',rf_AUC25)

In [66]:
# Regularised logistic regression on the model-9 inputs
# (severity + month + district + time of day): regParam=0.3, elasticNetParam=0.8.
lr_m9_1_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
)
lr_m9_1_assembler = feature.VectorAssembler(inputCols=lr_m9_1_inputs, outputCol='features')
lr_m9_1_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10,
    regParam=0.3, elasticNetParam=0.8)
crime_m9_1 = Pipeline(stages=[lr_m9_1_assembler, lr_m9_1_classifier])
crime_m9_1_fitted = crime_m9_1.fit(training_df)


In [67]:
# Regularised logistic regression on the model-4 inputs (all variables):
# regParam=0.3, elasticNetParam=0.8.
lr_m4_1_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']  # month
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
)
lr_m4_1_assembler = feature.VectorAssembler(inputCols=lr_m4_1_inputs, outputCol='features')
lr_m4_1_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10,
    regParam=0.3, elasticNetParam=0.8)
crime_m4_1 = Pipeline(stages=[lr_m4_1_assembler, lr_m4_1_classifier])
crime_m4_1_fitted = crime_m4_1.fit(training_df)

In [68]:
# Validation AUC of the regularised models (4_1, 9_1) next to their
# unregularised counterparts (4, 9). Uses the `evaluator` from an earlier cell.
AUC9_1 = evaluator.evaluate(crime_m9_1_fitted.transform(validation_df))
AUC4_1 = evaluator.evaluate(crime_m4_1_fitted.transform(validation_df))

# Labels name the model variant actually scored. Model 9_1 uses the district
# indicators too, so its label lists location like the model 9 label below.
print("Model 4_1 - ALL + parameters AUC: ", AUC4_1)
print("Model 9_1 - category + location + time + parameters AUC: ", AUC9_1)
print("Model 4 - ALL AUC: ", evaluator.evaluate(transformer_m4))
print("Model 9 - category + location + time AUC: ", evaluator.evaluate(transformer_m9))


In [69]:
# Regularised logistic regression, variant 4_2: regParam=0.1, elasticNetParam=0.4.
# This variant takes the 'Year' and 'Month' columns directly, plus day of week,
# time of day and district.
lr_m4_2_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                                   # severity
    + ['Year', 'Month']                                                              # date columns
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']              # day of week
    + ['Near noon', 'Afternoon', 'evening', 'Night']                                 # time of day
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                                 # district
)
lr_m4_2_assembler = feature.VectorAssembler(inputCols=lr_m4_2_inputs, outputCol='features')
lr_m4_2_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10,
    regParam=0.1, elasticNetParam=0.4)
crime_m4_2 = Pipeline(stages=[lr_m4_2_assembler, lr_m4_2_classifier])
crime_m4_2_fitted = crime_m4_2.fit(training_df)

In [70]:
# Regularised logistic regression, variant 9_2: regParam=0.1, elasticNetParam=0.4.
# This variant takes the 'Year' and 'Month' columns directly, plus district
# and time of day.
lr_m9_2_inputs = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']                      # severity
    + ['Year', 'Month']                                                 # date columns
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                    # district
    + ['Near noon', 'Afternoon', 'evening', 'Night']                    # time of day
)
lr_m9_2_assembler = feature.VectorAssembler(inputCols=lr_m9_2_inputs, outputCol='features')
lr_m9_2_classifier = classification.LogisticRegression(
    featuresCol='features', labelCol='Res_num', maxIter=10,
    regParam=0.1, elasticNetParam=0.4)
crime_m9_2 = Pipeline(stages=[lr_m9_2_assembler, lr_m9_2_classifier])
crime_m9_2_fitted = crime_m9_2.fit(training_df)


In [71]:
# Validation AUC of the second regularisation setting (variants 4_2 and 9_2).
# Uses the `evaluator` created in an earlier cell.
AUC9_2 = evaluator.evaluate(crime_m9_2_fitted.transform(validation_df))
AUC4_2 = evaluator.evaluate(crime_m4_2_fitted.transform(validation_df))

# Model 9_2 is fitted with the district indicators as well, so its label lists location.
print("Model 4_2 - ALL + parameters AUC: ", AUC4_2)
print("Model 9_2 - category + location + time + parameters AUC: ", AUC9_2)

In [72]:
# All features (crime level + year + month + day of week + hour level + location),
# logistic regression with regParam=0.02 and elasticNetParam=0.2.
m4_3_columns = [
    'Minor', 'Medium', 'Servere', 'Very Servere',
    'Year', 'Month',
    'Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday',
    'Near noon', 'Afternoon', 'evening', 'Night',
    'SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
    'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK',
]
m4_3_assembler = feature.VectorAssembler(inputCols=m4_3_columns, outputCol='features')
m4_3_classifier = classification.LogisticRegression(
    labelCol='Res_num',
    featuresCol='features',
    maxIter=10,
    regParam=0.02,
    elasticNetParam=0.2,
)
crime_m4_3 = Pipeline(stages=[m4_3_assembler, m4_3_classifier])
crime_m4_3_fitted = crime_m4_3.fit(training_df)

In [73]:
# Reduced feature set (crime level + year + month + location + hour level, no
# day-of-week columns), logistic regression with regParam=0.02 and elasticNetParam=0.2.
crime_m9_3 = Pipeline(stages=[
    feature.VectorAssembler(
        inputCols=(
            ['Minor', 'Medium', 'Servere', 'Very Servere']       # crime level
            + ['Year', 'Month']
            + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
               'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']     # location
            + ['Near noon', 'Afternoon', 'evening', 'Night']     # hour level
        ),
        outputCol='features',
    ),
    classification.LogisticRegression(
        labelCol='Res_num', featuresCol='features', maxIter=10,
        regParam=0.02, elasticNetParam=0.2,
    ),
])
crime_m9_3_fitted = crime_m9_3.fit(training_df)


In [74]:
# Score both regParam=0.02 / elasticNetParam=0.2 pipelines on validation_df with `evaluator`.
m9_3_validation_scored = crime_m9_3_fitted.transform(validation_df)
m4_3_validation_scored = crime_m4_3_fitted.transform(validation_df)
AUC9_3 = evaluator.evaluate(m9_3_validation_scored)
AUC4_3 = evaluator.evaluate(m4_3_validation_scored)

for auc_label, auc_value in (
    ("Model 4_3 - ALL + parameters AUC: ", AUC4_3),
    ("Model 9_3 - category + time + parameters AUC: ", AUC9_3),
):
    print(auc_label, auc_value)

In [75]:
# All features (crime level + year + month + day of week + hour level + location),
# logistic regression with regParam=0.01 and elasticNetParam=0.1.
crime_m4_4 = Pipeline(stages=[
    feature.VectorAssembler(
        inputCols=(
            ['Minor', 'Medium', 'Servere', 'Very Servere']                          # crime level
            + ['Year', 'Month']
            + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']     # day of week
            + ['Near noon', 'Afternoon', 'evening', 'Night']                        # hour level
            + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
               'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']                        # location
        ),
        outputCol='features',
    ),
    classification.LogisticRegression(
        labelCol='Res_num', featuresCol='features', maxIter=10,
        regParam=0.01, elasticNetParam=0.1,
    ),
])
crime_m4_4_fitted = crime_m4_4.fit(training_df)

In [76]:
# Reduced feature set (crime level + year + month + location + hour level, no
# day-of-week columns), logistic regression with regParam=0.01 and elasticNetParam=0.1.
m9_4_columns = [
    'Minor', 'Medium', 'Servere', 'Very Servere',
    'Year', 'Month',
    'SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
    'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK',
    'Near noon', 'Afternoon', 'evening', 'Night',
]
m9_4_assembler = feature.VectorAssembler(inputCols=m9_4_columns, outputCol='features')
m9_4_classifier = classification.LogisticRegression(
    labelCol='Res_num',
    featuresCol='features',
    maxIter=10,
    regParam=0.01,
    elasticNetParam=0.1,
)
crime_m9_4 = Pipeline(stages=[m9_4_assembler, m9_4_classifier])
crime_m9_4_fitted = crime_m9_4.fit(training_df)


In [77]:
# Score both regParam=0.01 / elasticNetParam=0.1 pipelines on validation_df with `evaluator`.
m9_4_validation_scored = crime_m9_4_fitted.transform(validation_df)
m4_4_validation_scored = crime_m4_4_fitted.transform(validation_df)
AUC9_4 = evaluator.evaluate(m9_4_validation_scored)
AUC4_4 = evaluator.evaluate(m4_4_validation_scored)

for auc_label, auc_value in (
    ("Model 4_4 - ALL + parameters AUC: ", AUC4_4),
    ("Model 9_4 - category + time + parameters AUC: ", AUC9_4),
):
    print(auc_label, auc_value)

## Evaluate the best models on the test set
- one per model type (logistic regression, random forest, gradient boosting)

In [78]:
# Evaluate crime_m9_fitted on testing_df with `evaluator` and keep the score as
# the logistic-regression result (lr_AUC_best).
# NOTE(review): the original note below describes the regParam=0.02 /
# elasticNetParam=0.2 all-variables model (crime_m4_3_fitted), but the code
# evaluates crime_m9_fitted -- confirm which model is meant to be reported here.
# L1 = 0.02, L3 = 0.2 - m4 (all variable + parameter)
lr_AUC_best = evaluator.evaluate(crime_m9_fitted.transform(testing_df))
lr_AUC_best

In [79]:
# Test-set score for rf_m2_fitted, computed with `evaluator`.
# Original note: all variables except day of the week, without parameters.
rf_test_scored = rf_m2_fitted.transform(testing_df)
rf_AUC_best = evaluator.evaluate(rf_test_scored)
rf_AUC_best

In [80]:
# Test-set score for gb_m9_fitted, computed with `evaluator`.
# Original note: all variables except day of the week, without parameters.
gb_test_scored = gb_m9_fitted.transform(testing_df)
gb_AUC_best = evaluator.evaluate(gb_test_scored)
gb_AUC_best

In [81]:
# Preview 10 test rows: the label column (Res_num) next to crime_m9_fitted's prediction.
crime_m9_fitted.transform(testing_df).select('Res_num', 'prediction').show(10)

In [82]:
# Preview 10 test rows: the label column (Res_num) next to crime_m4_3_fitted's prediction.
crime_m4_3_fitted.transform(testing_df).select('Res_num', 'prediction').show(10)

In [83]:
# Precision of crime_m9_fitted on the test set: true positives / predicted positives.
# fn.count() counts every non-null value, so counting boolean expressions gave the
# full row count on both sides and the ratio was always 1.0. Sum 0/1 flags instead.
# NOTE(review): the numerator is read as "correct among predicted positives"
# because the denominator counts predicted positives -- confirm this was the intent.
m9_test_scored = crime_m9_fitted.transform(testing_df)
m9_predicted_positive = fn.col('prediction') == 1
m9_true_positive = m9_predicted_positive & (fn.col('Res_num') == 1)
m9_test_scored.select(
    (
        fn.sum(m9_true_positive.cast('int'))
        / fn.sum(m9_predicted_positive.cast('int'))
    ).alias('precision')
).show()


# Evaluate the performance with confusion matrices
- calculate precision and recall for each model

In [84]:
# Score the test set with crime_m9_fitted, then count true positives
# (rows where both the label and the prediction equal 1).
lr_model_9 = crime_m9_fitted.transform(testing_df)

lr_label_is_positive = lr_model_9["Res_num"] == 1
lr_prediction_is_positive = lr_model_9["prediction"] == 1
TP = lr_model_9.where(lr_label_is_positive & lr_prediction_is_positive).count()


In [85]:
# Denominators for recall and precision:
#   TP_FN = rows whose label is 1, TP_FP = rows predicted as 1.
TP_FN = lr_model_9.where(fn.col("Res_num") == 1).count()
TP_FP = lr_model_9.where(fn.col("prediction") == 1).count()

In [86]:
# Recall = TP / (TP + FN); precision = TP / (TP + FP).
# A model that predicts no positives (or a split with no positive labels) makes
# the denominator 0; report NaN instead of raising ZeroDivisionError.
lr_recall = TP / TP_FN if TP_FN else float('nan')
lr_precision = TP / TP_FP if TP_FP else float('nan')

In [87]:
# Score the test set with gb_m9_fitted and collect the confusion-matrix counts
# needed for recall and precision.
gb_model_9 = gb_m9_fitted.transform(testing_df)

gb_label_is_positive = gb_model_9["Res_num"] == 1
gb_prediction_is_positive = gb_model_9["prediction"] == 1

gb_TP = gb_model_9.where(gb_label_is_positive & gb_prediction_is_positive).count()
gb_TP_FN = gb_model_9.where(gb_label_is_positive).count()
gb_TP_FP = gb_model_9.where(gb_prediction_is_positive).count()

In [88]:
# Recall = TP / (TP + FN); precision = TP / (TP + FP).
# A model that predicts no positives (or a split with no positive labels) makes
# the denominator 0; report NaN instead of raising ZeroDivisionError.
gb_recall = gb_TP / gb_TP_FN if gb_TP_FN else float('nan')
gb_precision = gb_TP / gb_TP_FP if gb_TP_FP else float('nan')

In [89]:
# Score the test set with rf_m2_fitted and collect the confusion-matrix counts
# needed for recall and precision.
rf_model_2 = rf_m2_fitted.transform(testing_df)

rf_label_is_positive = rf_model_2["Res_num"] == 1
rf_prediction_is_positive = rf_model_2["prediction"] == 1

rf_TP = rf_model_2.where(rf_label_is_positive & rf_prediction_is_positive).count()
rf_TP_FN = rf_model_2.where(rf_label_is_positive).count()
rf_TP_FP = rf_model_2.where(rf_prediction_is_positive).count()

In [90]:
# Recall = TP / (TP + FN); precision = TP / (TP + FP).
# A model that predicts no positives (or a split with no positive labels) makes
# the denominator 0; report NaN instead of raising ZeroDivisionError.
rf_recall = rf_TP / rf_TP_FN if rf_TP_FN else float('nan')
rf_precision = rf_TP / rf_TP_FP if rf_TP_FP else float('nan')

In [91]:
# Print recall (plain label) and precision ('-P' label) for each model.
metric_rows = [
    ('RF', rf_recall),
    ('RF-P', rf_precision),
    ('LR', lr_recall),
    ('LR-P', lr_precision),
    ('GB', gb_recall),
    ('GB-P', gb_precision),
]
for metric_label, metric_value in metric_rows:
    print(metric_label, metric_value)


In [92]:
# Row count of testing_df.
testing_df.count()

In [93]:
# Count the test rows that crime_m9_fitted predicts as positive (prediction = 1),
# i.e. the TP + FP denominator of precision. (Recall needs the number of rows
# labelled 1 instead; the higher the recall, the fewer false negatives.)
# fn.sum('prediction = 1') treated the string as a column *name* and failed to
# resolve it; build the boolean expression explicitly and cast it so it can be summed.
crime_m9_fitted.transform(testing_df).select(
    fn.sum(fn.expr('prediction = 1').cast('int')).alias('predicted_positive')
).show()

In [94]:
# Mean of the prediction column on the test set.
# Assuming predictions are 0/1, this is the share of rows predicted as 1.
crime_m9_fitted.transform(testing_df).select(fn.avg('prediction')).show()


In [95]:
# Coefficients of the last stage (stages[-1]) of the fitted crime_m9 pipeline.
crime_m9_fitted.stages[-1].coefficients

In [96]:
# Intercept of the last stage (stages[-1]) of the fitted crime_m9 pipeline.
crime_m9_fitted.stages[-1].intercept

In [97]:
# Coefficients of the last stage (stages[-1]) of the fitted crime_m4 pipeline.
# NOTE(review): the next two cells inspect crime_m4_3_fitted -- confirm that
# crime_m4_fitted (not crime_m4_3_fitted) is the model intended here.
crime_m4_fitted.stages[-1].coefficients

In [98]:
# Intercept of the LogisticRegression stage (last stage) of the fitted crime_m4_3 pipeline.
crime_m4_3_fitted.stages[-1].intercept

In [99]:
# Coefficients of the LogisticRegression stage (last stage) of the fitted crime_m4_3 pipeline.
crime_m4_3_fitted.stages[-1].coefficients

### Check whether a standard scaler and other parameters change the results

In [100]:
# All-features logistic regression with month columns (Jan-Nov) in place of
# Year/Month; the assembled vector is standardised (withMean, withStd) and the
# classifier reads the scaled column. regParam=0.02, elasticNetParam=0.2.
m4_5_columns = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']
    + ['Near noon', 'Afternoon', 'evening', 'Night']
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
)
crime_m4_5 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=m4_5_columns, outputCol='features'),
    feature.StandardScaler(
        inputCol='features', outputCol='scalar_features',
        withMean=True, withStd=True,
    ),
    classification.LogisticRegression(
        labelCol='Res_num', featuresCol='scalar_features', maxIter=10,
        regParam=0.02, elasticNetParam=0.2,
    ),
])
crime_m4_5_fitted = crime_m4_5.fit(training_df)

In [101]:
# GBTClassifier on the standardised all-features vector (month columns Jan-Nov).
gb_m4_5_columns = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']
    + ['Near noon', 'Afternoon', 'evening', 'Night']
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
)
gb_m4_5 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m4_5_columns, outputCol='features'),
    feature.StandardScaler(
        inputCol='features', outputCol='scalar_features',
        withMean=True, withStd=True,
    ),
    classification.GBTClassifier(
        labelCol='Res_num', featuresCol='scalar_features', maxIter=10,
    ),
])
gb_m4_5_fitted = gb_m4_5.fit(training_df)

In [102]:
# Random forest on the standardised all-features vector (month columns Jan-Nov).
# The classifier previously read 'features', which left the StandardScaler
# output unused; it now reads 'scalar_features' like the logistic-regression
# and GBT pipelines in this section, so the scaler comparison is actually run.
rf_m4_5 = Pipeline(stages=[
    feature.VectorAssembler(
        inputCols=['Minor', 'Medium', 'Servere', 'Very Servere',
                   'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',
                   'Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday',
                   'Near noon', 'Afternoon', 'evening', 'Night',
                   'SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
                   'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK'],
        outputCol='features'),
    feature.StandardScaler(inputCol='features', outputCol='scalar_features',
                           withMean=True, withStd=True),
    classification.RandomForestClassifier(labelCol='Res_num', featuresCol='scalar_features',
                                          seed=0, numTrees=10, cacheNodeIds=True),
])
rf_m4_5_fitted = rf_m4_5.fit(training_df)

In [103]:
# crime level + month (Jan-Nov columns) + location + hour level, standardised,
# then logistic regression with regParam=0.02 and elasticNetParam=0.2.
m9_5_columns = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
    + ['Near noon', 'Afternoon', 'evening', 'Night']
)
crime_m9_5 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=m9_5_columns, outputCol='features'),
    feature.StandardScaler(
        inputCol='features', outputCol='scalar_features',
        withMean=True, withStd=True,
    ),
    classification.LogisticRegression(
        labelCol='Res_num', featuresCol='scalar_features', maxIter=10,
        regParam=0.02, elasticNetParam=0.2,
    ),
])
crime_m9_5_fitted = crime_m9_5.fit(training_df)

In [104]:
# crime level + month (Jan-Nov columns) + location + hour level, standardised,
# then GBTClassifier.
gb_m9_5_columns = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']
    + ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
    + ['Near noon', 'Afternoon', 'evening', 'Night']
)
gb_m9_5 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=gb_m9_5_columns, outputCol='features'),
    feature.StandardScaler(
        inputCol='features', outputCol='scalar_features',
        withMean=True, withStd=True,
    ),
    classification.GBTClassifier(
        labelCol='Res_num', featuresCol='scalar_features', maxIter=10,
    ),
])
gb_m9_5_fitted = gb_m9_5.fit(training_df)

In [105]:
# crime level + month (Jan-Nov columns) + location + hour level, standardised,
# then random forest.
# The classifier previously read 'features', which left the StandardScaler
# output unused; it now reads 'scalar_features' like the logistic-regression
# and GBT pipelines in this section, so the scaler comparison is actually run.
rf_m9_5 = Pipeline(stages=[
    feature.VectorAssembler(
        inputCols=['Minor', 'Medium', 'Servere', 'Very Servere',
                   'Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov',
                   'SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
                   'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK',
                   'Near noon', 'Afternoon', 'evening', 'Night'],
        outputCol='features'),
    feature.StandardScaler(inputCol='features', outputCol='scalar_features',
                           withMean=True, withStd=True),
    classification.RandomForestClassifier(labelCol='Res_num', featuresCol='scalar_features',
                                          seed=0, numTrees=10, cacheNodeIds=True),
])
rf_m9_5_fitted = rf_m9_5.fit(training_df)

In [106]:
# Score the two standardised logistic-regression pipelines on validation_df with `evaluator`.
m9_5_validation_scored = crime_m9_5_fitted.transform(validation_df)
m4_5_validation_scored = crime_m4_5_fitted.transform(validation_df)
AUC9_5 = evaluator.evaluate(m9_5_validation_scored)
AUC4_5 = evaluator.evaluate(m4_5_validation_scored)

for auc_label, auc_value in (
    ("Model 4_5 - ALL + parameters AUC: ", AUC4_5),
    ("Model 9_5 - category + time + parameters AUC: ", AUC9_5),
):
    print(auc_label, auc_value)

In [107]:
# Score the two standardised GBT pipelines on validation_df with `evaluator`.
gb_AUC9_5 = evaluator.evaluate(gb_m9_5_fitted.transform(validation_df))
gb_AUC4_5 = evaluator.evaluate(gb_m4_5_fitted.transform(validation_df))

# Labels carry a "gb_" prefix (matching the "rf_" labels of the random-forest
# cell) so this output cannot be confused with the identically worded
# logistic-regression lines printed by the previous cell.
print("gb_Model 4_5 - ALL AUC: ", gb_AUC4_5)
print("gb_Model 9_5 - category + time AUC: ", gb_AUC9_5)

In [108]:
# Score the two random-forest pipelines of this section on validation_df with `evaluator`.
rf_m9_5_validation_scored = rf_m9_5_fitted.transform(validation_df)
rf_m4_5_validation_scored = rf_m4_5_fitted.transform(validation_df)
rf_AUC9_5 = evaluator.evaluate(rf_m9_5_validation_scored)
rf_AUC4_5 = evaluator.evaluate(rf_m4_5_validation_scored)

for auc_label, auc_value in (
    ("rf_Model 4_5 - ALL AUC: ", rf_AUC4_5),
    ("rf_Model 9_5 - category + time AUC: ", rf_AUC9_5),
):
    print(auc_label, auc_value)

In [109]:
# List the distinct values of the Res_num label column in df_final.
df_final.select('Res_num').distinct().show()

In [110]:
# All features (Year/Month form), standardised, then logistic regression with
# no regParam / elasticNetParam given (library defaults apply).
m4_6_columns = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']
    + ['Year', 'Month']
    + ['Sunday', 'Monday', 'Tuesday', 'Thursday', 'Friday', 'Saturday']
    + ['Near noon', 'Afternoon', 'evening', 'Night']
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
)
crime_m4_6 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=m4_6_columns, outputCol='features'),
    feature.StandardScaler(
        inputCol='features', outputCol='scalar_features',
        withMean=True, withStd=True,
    ),
    classification.LogisticRegression(
        labelCol='Res_num', featuresCol='scalar_features', maxIter=10,
    ),
])
crime_m4_6_fitted = crime_m4_6.fit(training_df)

In [111]:
# crime level + year + month + location + hour level, standardised, then
# logistic regression with no regParam / elasticNetParam given (library defaults apply).
m9_6_columns = (
    ['Minor', 'Medium', 'Servere', 'Very Servere']
    + ['Year', 'Month']
    + ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
       'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
    + ['Near noon', 'Afternoon', 'evening', 'Night']
)
crime_m9_6 = Pipeline(stages=[
    feature.VectorAssembler(inputCols=m9_6_columns, outputCol='features'),
    feature.StandardScaler(
        inputCol='features', outputCol='scalar_features',
        withMean=True, withStd=True,
    ),
    classification.LogisticRegression(
        labelCol='Res_num', featuresCol='scalar_features', maxIter=10,
    ),
])
crime_m9_6_fitted = crime_m9_6.fit(training_df)

In [112]:
# Score the two standardised logistic-regression pipelines that set no
# regParam / elasticNetParam on validation_df with `evaluator`.
m9_6_validation_scored = crime_m9_6_fitted.transform(validation_df)
m4_6_validation_scored = crime_m4_6_fitted.transform(validation_df)
AUC9_6 = evaluator.evaluate(m9_6_validation_scored)
AUC4_6 = evaluator.evaluate(m4_6_validation_scored)

for auc_label, auc_value in (
    ("Model 4_6 - ALL + parameters AUC: ", AUC4_6),
    ("Model 9_6 - category + time + parameters AUC: ", AUC9_6),
):
    print(auc_label, auc_value)

In [113]:
# Intercept of the LogisticRegression stage (last stage) of the fitted crime_m9_6 pipeline.
crime_m9_6_fitted.stages[-1].intercept

### Inference
- Feature importance for random forest and gradient boosting
- coefficients for logistic regression

In [114]:
# Coefficients of crime_m9_fitted's last stage, converted to a plain Python list.
m9_coe = crime_m9_fitted.stages[-1].coefficients.tolist()


In [115]:
# Feature names in assembler order:
# 4 level + 11 month + 9 location + 4 hour = 28
m9_level_names = ['Minor', 'Medium', 'Servere', 'Very Servere']
m9_month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                  'Jul', 'Aug', 'Sep', 'Oct', 'Nov']
m9_location_names = ['SOUTHERN', 'MISSION', 'NORTHERN', 'CENTRAL', 'BAYVIEW',
                     'TENDERLOIN', 'INGLESIDE', 'TARAVAL', 'PARK']
m9_hour_names = ['Near noon', 'Afternoon', 'evening', 'Night']
m9_feature = m9_level_names + m9_month_names + m9_location_names + m9_hour_names


In [116]:
# Number of coefficients; it should equal len(m9_feature) for the positional
# pairing of names and coefficients in the following cells to be valid.
len(m9_coe)

In [117]:
# Feature name -> coefficient lookup; names and coefficients are paired by position.
mapping = {feature_name: coefficient for feature_name, coefficient in zip(m9_feature, m9_coe)}
mapping

In [118]:
# One-column pandas frame of feature names (column 'feature').
# Built directly in pandas: m9_feature is already a local Python list, so the
# Spark createDataFrame(...).toPandas() round trip was unnecessary, and naming
# the column here replaces the in-place rename. The mid-cell head(3) call
# produced no output and has been dropped.
m9_feature_df = pd.DataFrame({'feature': m9_feature})

In [119]:
# One-column pandas frame of coefficients. The column is called 'value' because
# the next cell renames 'value' to 'coefficients'.
# Built directly in pandas so the values keep full float64 precision; the
# previous Spark FloatType round trip narrowed them to 32-bit floats.
m9_coe_df = pd.DataFrame({'value': m9_coe})

In [120]:
# NOTE(review): type(...) is not the last expression of the cell, so its result
# is never displayed; it looks like leftover debugging.
type(m9_coe_df)
# Rename the 'value' column to 'coefficients' in place.
m9_coe_df.rename(columns={'value':'coefficients'},inplace=True)

In [121]:
# Put feature names and coefficients side by side (rows aligned on the index)
# and add a 'weights' column equal to coefficient / 4.
# NOTE(review): the divisor 4 is not explained anywhere nearby -- confirm its meaning.
result = pd.concat([m9_feature_df, m9_coe_df], axis=1).assign(
    weights=lambda frame: frame['coefficients'] / 4
)

In [122]:
# Display the feature / coefficients / weights table.
result

In [123]:
def ExtractFeatureImp(featureImp, dataset, featuresCol):
    """Pair each assembled feature with its importance score, highest first.

    Parameters
    ----------
    featureImp
        Indexable collection of importance scores, looked up by feature index
        (the notebook passes a fitted model's ``featureImportances``).
    dataset
        Object whose ``schema[featuresCol].metadata["ml_attr"]["attrs"]`` maps
        attribute groups to lists of records; every record needs an ``idx`` key.
    featuresCol
        Name of the feature-vector column whose metadata is read.

    Returns
    -------
    pandas.DataFrame
        One row per attribute record plus a ``score`` column, sorted by
        ``score`` in descending order. When the column carries no attribute
        records, an empty frame with columns ``idx``, ``name``, ``score``.
    """
    attr_groups = dataset.schema[featuresCol].metadata["ml_attr"]["attrs"]

    # Flatten every group into one list. extend() appends in place instead of
    # re-copying the accumulated list on each iteration (lst = lst + other).
    records = []
    for group_records in attr_groups.values():
        records.extend(group_records)

    if not records:
        # pd.DataFrame([]) has no 'idx' column, so the score lookup below
        # would raise KeyError; return an empty, well-formed table instead.
        return pd.DataFrame(columns=['idx', 'name', 'score'])

    varlist = pd.DataFrame(records)
    varlist['score'] = varlist['idx'].apply(lambda x: featureImp[x])
    return varlist.sort_values('score', ascending=False)

In [124]:
# Last stage of the fitted gb_m9 pipeline; its featureImportances are read below.
gb_m9_fit_model = gb_m9_fitted.stages[-1]
# Transform validation_df to obtain a DataFrame whose "features" column schema
# carries the attribute metadata that ExtractFeatureImp reads.
gb_m9_trans = gb_m9_fitted.transform(validation_df)

# First 20 rows of the importance table (ExtractFeatureImp sorts by score, descending).
ExtractFeatureImp(gb_m9_fit_model.featureImportances, gb_m9_trans, "features").head(20)

In [125]:
# Last stage of the fitted rf_m2 pipeline; its featureImportances are read below.
rf_m2_fit_model = rf_m2_fitted.stages[-1]
# Transform validation_df to obtain a DataFrame whose "features" column schema
# carries the attribute metadata that ExtractFeatureImp reads.
m2_trans = rf_m2_fitted.transform(validation_df)

# First 20 rows of the importance table (ExtractFeatureImp sorts by score, descending).
ExtractFeatureImp(rf_m2_fit_model.featureImportances, m2_trans, "features").head(20)