**CPSC 4660 Project Notebook**

Import all needed libraries.

In [1]:
import numpy as np
import pandas as pd
import math
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV
from sklearn.ensemble import RandomForestClassifier

Print the initial, unmodified table from the csv.

In [2]:
crimes = pd.read_csv('crime.csv')
print("Crime in Vancouver (2003-2017)")
display(crimes)

Crime in Vancouver (2003-2017)


Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...,...
530647,Break and Enter Residential/Other,2017,3,3,9.0,16.0,31XX ADANAC ST,Hastings-Sunrise,497265.49,5458296.71,49.277420,-123.037595
530648,Mischief,2017,5,29,22.0,30.0,14XX E 7TH AVE,Grandview-Woodland,494533.97,5456824.97,49.264163,-123.075129
530649,Offence Against a Person,2017,4,13,,,OFFSET TO PROTECT PRIVACY,,0.00,0.00,0.000000,0.000000
530650,Theft from Vehicle,2017,6,5,17.0,0.0,8XX HAMILTON ST,Central Business District,491487.85,5458385.78,49.278168,-123.117031


In this dataset, the crime 'Offence Against a Person' has all data missing except year, month, and day any time it appears. This has been done to protect the privacy of those whom the offence was commited against. Unfortunately, this means that information provided from records of this type of crime will not be useful in providing predictive information. So, despite the fact that records containing 'Offence Against a Person' make up 10.2% of the crime in this dataset (54,142 out of 530,652 rows), this crime will have to be removed from the dataset.

In [3]:
crimes = crimes[crimes.TYPE != 'Offence Against a Person']
display(crimes)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...,...
530646,Mischief,2017,1,18,14.0,44.0,14XX E HASTINGS ST,Grandview-Woodland,494563.75,5458727.40,49.281276,-123.074746
530647,Break and Enter Residential/Other,2017,3,3,9.0,16.0,31XX ADANAC ST,Hastings-Sunrise,497265.49,5458296.71,49.277420,-123.037595
530648,Mischief,2017,5,29,22.0,30.0,14XX E 7TH AVE,Grandview-Woodland,494533.97,5456824.97,49.264163,-123.075129
530650,Theft from Vehicle,2017,6,5,17.0,0.0,8XX HAMILTON ST,Central Business District,491487.85,5458385.78,49.278168,-123.117031


The dataset being worked with should be as complete as possible. So, as a precaution, any potential records containing null/NA/NaN values that may remain in the dataset need to be removed.

In [4]:
crimes.dropna(inplace=True)
display(crimes)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...,...
530646,Mischief,2017,1,18,14.0,44.0,14XX E HASTINGS ST,Grandview-Woodland,494563.75,5458727.40,49.281276,-123.074746
530647,Break and Enter Residential/Other,2017,3,3,9.0,16.0,31XX ADANAC ST,Hastings-Sunrise,497265.49,5458296.71,49.277420,-123.037595
530648,Mischief,2017,5,29,22.0,30.0,14XX E 7TH AVE,Grandview-Woodland,494533.97,5456824.97,49.264163,-123.075129
530650,Theft from Vehicle,2017,6,5,17.0,0.0,8XX HAMILTON ST,Central Business District,491487.85,5458385.78,49.278168,-123.117031


Now the number of times each crime type appears in the dataset needs to be determined. This information will be used to reduce the number of crimes being used in the model so that it can be more focused on the crime that occurs most frequently.

In [5]:
crime_type_counts = crimes.TYPE.value_counts()
print(crime_type_counts)
crimes_size = len(crimes.index)
crime_types = crime_type_counts.index
crime_counts = crime_type_counts.values
crime_percentages = (crime_counts / crimes_size) * 100
crime_percentages_len = len(crime_percentages)
for i in range(crime_percentages_len):
    crime_percentages[i] = round(crime_percentages[i], 2)
for i in range(crime_percentages_len):
    print(f'{crime_types[i]:<55}', f'{"{0:.2f}".format(crime_percentages[i]):>6}', '%')

Theft from Vehicle                                        170889
Mischief                                                   70157
Break and Enter Residential/Other                          60856
Other Theft                                                52160
Theft of Vehicle                                           38351
Break and Enter Commercial                                 33841
Theft of Bicycle                                           25620
Vehicle Collision or Pedestrian Struck (with Injury)       21887
Vehicle Collision or Pedestrian Struck (with Fatality)       254
Name: TYPE, dtype: int64
Theft from Vehicle                                       36.05 %
Mischief                                                 14.80 %
Break and Enter Residential/Other                        12.84 %
Other Theft                                              11.00 %
Theft of Vehicle                                          8.09 %
Break and Enter Commercial                                7.14 %


From the above calculations, it can be seen that the top few crimes make up the bulk of this dataset. So, in order to create a more focused model, we will take the top 4 of 9 crimes. The reasoning behind this is that these 4 crimes each make up more than 10% of the dataset and together they make up (36.05 + 14.80 + 12.84 + 11.00) = 74.69% of the dataset. Thus, we will remove the bottom 5 crimes from the dataset.

In [6]:
crimes = crimes[(crimes.TYPE == 'Theft from Vehicle') | (crimes.TYPE == 'Mischief') | (crimes.TYPE == 'Break and Enter Residential/Other') | (crimes.TYPE == 'Other Theft')]
display(crimes)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15.0,20.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16.0,40.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11.0,15.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17.0,45.0,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...,...
530645,Break and Enter Residential/Other,2017,6,17,12.0,29.0,30XX W 7TH AVE,Kitsilano,487435.85,5457028.69,49.265891,-123.172697
530646,Mischief,2017,1,18,14.0,44.0,14XX E HASTINGS ST,Grandview-Woodland,494563.75,5458727.40,49.281276,-123.074746
530647,Break and Enter Residential/Other,2017,3,3,9.0,16.0,31XX ADANAC ST,Hastings-Sunrise,497265.49,5458296.71,49.277420,-123.037595
530648,Mischief,2017,5,29,22.0,30.0,14XX E 7TH AVE,Grandview-Woodland,494533.97,5456824.97,49.264163,-123.075129


Some data can have further information extracted from it. In our case, extracting a DayOfWeek (values 1-7 for Monday-Sunday) and WeekOfYear (1-53) attribute may help in producing more accurate predicitions. In order to do so, we will also have to convert the HOUR and MINUTE values from floats to ints.

In [7]:
crimes = crimes.astype({"HOUR": int, "MINUTE": int})

In [8]:
crime_dayofweek = []
crime_weekofyear = []
for i, r in crimes.iterrows():
    crime_datetime = datetime(r['YEAR'], r['MONTH'], r['DAY'], r['HOUR'], r['MINUTE'])
    crime_dayofweek.append(crime_datetime.isocalendar()[2])
    crime_weekofyear.append(crime_datetime.isocalendar()[1])
crimes.insert(6, 'DayOfWeek', crime_dayofweek)
crimes.insert(7, 'WeekOfYear', crime_weekofyear)
display(crimes)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,Other Theft,2003,5,12,16,15,1,20,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
1,Other Theft,2003,5,7,15,20,3,19,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
2,Other Theft,2003,4,23,16,40,3,17,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
3,Other Theft,2003,4,20,11,15,7,16,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
4,Other Theft,2003,4,12,17,45,6,15,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530645,Break and Enter Residential/Other,2017,6,17,12,29,6,24,30XX W 7TH AVE,Kitsilano,487435.85,5457028.69,49.265891,-123.172697
530646,Mischief,2017,1,18,14,44,3,3,14XX E HASTINGS ST,Grandview-Woodland,494563.75,5458727.40,49.281276,-123.074746
530647,Break and Enter Residential/Other,2017,3,3,9,16,5,9,31XX ADANAC ST,Hastings-Sunrise,497265.49,5458296.71,49.277420,-123.037595
530648,Mischief,2017,5,29,22,30,1,22,14XX E 7TH AVE,Grandview-Woodland,494533.97,5456824.97,49.264163,-123.075129


In order to make this dataset compatible with the algorithms we will be using, we need to convert the crime types to numerical values (Theft from Vehicle = 0, Mischief = 1, Break and Enter Residential/Other = 2, Other Theft = 3).

In [9]:
crimes['TYPE'].replace(['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], [0, 1, 2, 3], inplace=True)
display(crimes)

Unnamed: 0,TYPE,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,HUNDRED_BLOCK,NEIGHBOURHOOD,X,Y,Latitude,Longitude
0,3,2003,5,12,16,15,1,20,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
1,3,2003,5,7,15,20,3,19,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
2,3,2003,4,23,16,40,3,17,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
3,3,2003,4,20,11,15,7,16,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
4,3,2003,4,12,17,45,6,15,9XX TERMINAL AVE,Strathcona,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530645,2,2017,6,17,12,29,6,24,30XX W 7TH AVE,Kitsilano,487435.85,5457028.69,49.265891,-123.172697
530646,1,2017,1,18,14,44,3,3,14XX E HASTINGS ST,Grandview-Woodland,494563.75,5458727.40,49.281276,-123.074746
530647,2,2017,3,3,9,16,5,9,31XX ADANAC ST,Hastings-Sunrise,497265.49,5458296.71,49.277420,-123.037595
530648,1,2017,5,29,22,30,1,22,14XX E 7TH AVE,Grandview-Woodland,494533.97,5456824.97,49.264163,-123.075129


Removing any remaining categorical data so that we can work with strictly numerical data.

In [10]:
crimes.pop('HUNDRED_BLOCK') 
crimes.pop('NEIGHBOURHOOD')

0                        Strathcona
1                        Strathcona
2                        Strathcona
3                        Strathcona
4                        Strathcona
                    ...            
530645                    Kitsilano
530646           Grandview-Woodland
530647             Hastings-Sunrise
530648           Grandview-Woodland
530650    Central Business District
Name: NEIGHBOURHOOD, Length: 354062, dtype: object

Now, we will split the dataset into 2 sets. The first dataset is the dataset being used to train up the model. The second dataset is the dataset which will be used for evaluating accuracy of the algorithm. These will be split up psuedorandomly in order to have a "random" split that will be consistent every time the code is run. The first step in doing this is to split the dataset into the features to look at and the target which will be predicted. The second step is to split the overall dataset into train and test sections.

In [11]:
features = ['YEAR', 'MONTH', 'DAY', 'HOUR', 'MINUTE', 'DayOfWeek', 'WeekOfYear', 'X', 'Y', 'Latitude', 'Longitude']
X = crimes.loc[:, features]
target = ['TYPE']
Y = crimes.loc[:, target]
display(X)
display(Y)

Unnamed: 0,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
0,2003,5,12,16,15,1,20,493906.50,5457452.47,49.269802,-123.083763
1,2003,5,7,15,20,3,19,493906.50,5457452.47,49.269802,-123.083763
2,2003,4,23,16,40,3,17,493906.50,5457452.47,49.269802,-123.083763
3,2003,4,20,11,15,7,16,493906.50,5457452.47,49.269802,-123.083763
4,2003,4,12,17,45,6,15,493906.50,5457452.47,49.269802,-123.083763
...,...,...,...,...,...,...,...,...,...,...,...
530645,2017,6,17,12,29,6,24,487435.85,5457028.69,49.265891,-123.172697
530646,2017,1,18,14,44,3,3,494563.75,5458727.40,49.281276,-123.074746
530647,2017,3,3,9,16,5,9,497265.49,5458296.71,49.277420,-123.037595
530648,2017,5,29,22,30,1,22,494533.97,5456824.97,49.264163,-123.075129


Unnamed: 0,TYPE
0,3
1,3
2,3
3,3
4,3
...,...
530645,2
530646,1
530647,2
530648,1


In [12]:
# Splitting the dataset into test and train sections. 75% going to train and 25% going to test.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, random_state = 0, train_size = .75)
display(X_train)
display(X_test)
display(Y_train)
display(Y_test)

Unnamed: 0,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
160946,2006,4,1,15,0,6,13,491259.23,5459007.75,49.283760,-123.120188
508344,2016,6,1,17,0,3,22,495148.02,5454528.42,49.243511,-123.066661
348814,2011,12,19,15,0,1,51,490247.74,5456234.11,49.258795,-123.134028
74383,2004,1,13,15,43,2,3,491827.08,5459079.55,49.284414,-123.112381
359096,2012,2,29,18,16,3,9,491295.24,5458744.77,49.281395,-123.119687
...,...,...,...,...,...,...,...,...,...,...,...
184432,2006,2,19,19,37,7,7,492220.96,5456883.18,49.264662,-123.106922
457333,2015,8,6,13,30,4,32,491407.74,5458871.84,49.282539,-123.118143
228945,2008,7,4,7,0,5,27,489876.35,5450301.37,49.205423,-123.138982
175655,2006,11,6,17,32,1,45,491394.12,5458844.26,49.282291,-123.118329


Unnamed: 0,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
50989,2004,1,8,20,0,4,2,488974.18,5457668.25,49.271674,-123.151570
93153,2004,9,30,20,0,4,40,491651.57,5458615.15,49.280234,-123.114785
285997,2009,6,26,18,0,5,26,493681.15,5458813.86,49.282046,-123.086883
398506,2013,8,7,16,0,3,32,491355.26,5460631.89,49.298371,-123.118902
331370,2011,5,16,20,16,1,20,490797.66,5459282.31,49.286223,-123.126541
...,...,...,...,...,...,...,...,...,...,...,...
462265,2015,10,24,0,0,6,43,490116.26,5459123.65,49.284785,-123.135906
7976,2003,9,22,23,0,1,39,490092.82,5459451.96,49.287738,-123.136237
56138,2004,3,10,17,15,3,11,495041.52,5458609.51,49.280220,-123.068175
46218,2003,7,17,17,0,4,29,489997.09,5456450.44,49.260737,-123.137478


Unnamed: 0,TYPE
160946,0
508344,2
348814,0
74383,3
359096,3
...,...
184432,1
457333,3
228945,2
175655,3


Unnamed: 0,TYPE
50989,2
93153,0
285997,0
398506,0
331370,3
...,...
462265,0
7976,1
56138,1
46218,0


Before we begin processing the data, we will display our classes for the implemented algorithms.

Naive Bayes

In [13]:
class NaiveBayes:
    
    def NBpredict(self, A):
        yARG = [self._NBpredict(a) for a in A]#This will itterate through every posibility of a that will be used to calculate P(a[1..n]|b)
        return np.array(yARG)#Note: the return must be an array for the accuracy calculation. I over looked this and took 45 mins to correct :(
   
        
    def _NBpredict(self, a): #You need this function because for some reason you are not able to nest the for loops with a in A. Thus we make a second function
        pAB_values = [] #Stores all the possible answers of our probibility calculations 
        for i in range(len(self._classes)):
            j = self._classes[i] #Going through the array of unique values
            prior = np.log(self._priors[i])
            pAB = np.sum(np.log(self.NPgaussian(i, a)))
            pAB = pAB + prior
            pAB_values.append(pAB)

        return self._classes[np.argmax(pAB_values)] # return the highest P(A|B) to the calling function
    
    def NPgaussian(self, index, a):
        meanNew = self._mean[index] #Select the mean at the given index from our FIT
        varNew = self._var[index] #Select the variance at the given index from our FIT
        
        #Make a fancy mathematical expression for the gaussian equation calculation
        numerator = 1 * np.exp(-((a - meanNew) ** 2) / (2 * varNew))
        denominator = varNew * np.sqrt(2 * np.pi)
        
        return (numerator/denominator) 
    
    def NBfit(self, A, b):
        #assignes the integer value of rows to NumRows and the integer value of columbs to numCol
        numRows = A.shape[0]
        numCol = A.shape[1]
        #specific classifier of the class = find the unique elements of an array and return that array sorted to us
        self._classes = np.unique(b)
        #will return the number of classes (ie the lenght of the class identifier that is unique)
        numClass = len(self._classes)
    
        #We calculate the mean, variance and frequency for each unique columb , data type is numpy float64 bit (dtype = np.float64)
        self._mean = np.zeros((numClass, numCol)) #Creates a zero filled matrix of size numClass x numCol
        self._var = np.zeros((numClass, numCol)) #Creates a zero filled matrix of size numClass x numCol
        self._priors = np.zeros(numClass) #Creates a zero filled array/1D matrix of size numClass
    
        #This loop will actually do the calculating of the above zero matrices
        for i in range(numClass):
            j = self._classes[i] #Going through the array of unique values
            classXj = A[b == j] #The base array of crime type == index selected value of crime type - calculate and store in the zero matrix above
            self._mean[i,] = classXj.mean()
            self._var[i,] = classXj.var()
            self._priors[i] = classXj.shape[0] / float(numRows) #Number of unique rows we have divided by the total number of rows we have 

Recursive Feature Elimination

In [14]:
# Simplified class for Recursive Feature Elimination that uses RandomForestClassifier
class Recursive_Feature_Elimination:
    
    # Initiailize object with the desired number of features to select and the step size
    # The step size determines how many features to drop per iteration of the algorithm
    def __init__(self, n_features_to_select = 0, step = 1):
        self.estimator_ = RandomForestClassifier() # Estimator used for the class: set to RandomForestClassifer
        self.n_features_to_select_ = n_features_to_select # Number of features being selected
        self.step_ = step # Step size
        self.n_features_ = 0 # The number of features selected
        self.ranking_ = [] # Rank of features: chosen features are 1, and non-chosen numbers are ranked from 2 - number of features in the dataset
        self.support_ = [] # Boolean representation of ranking: chosen features are True, non-chosen features are False
    
    # Function that actually performs the Recursive Feature Elimination
    # Calls fit on the estimator recursively to find feature importances and removes unimportant features until only the desired number of features remain
    def fit(self, X, y):
        if self.n_features_to_select_ <= 0: # If n_features_to_select_ is set <= 0: default it to half the size of X
            self.n_features_to_select_ = math.ceil(len(X.columns) / 2)
        self.n_features_ = len(X.columns) # Set n_features_ to the number of features in X
        self.ranking_ = np.zeros(self.n_features_) # Initialize ranking_ to an array of zeros of size n_features_
        self.support_ = np.array([False] * self.n_features_) # Initialize support_ to an arry of False's of size n_features_
        X_copy = X.copy() # Copy X so that we do not modify it in our calculations
        while (self.n_features_ > self.n_features_to_select_):
            self.estimator_.fit(X_copy, y) # Call the fit function of the estimator object
            fi = self.estimator_.feature_importances_ # Store feature_importances_ from most recent call of fit in fi
            for i in range(self.step_): # Remove step_ amount of features for next calculation and mark their rank
                if self.n_features_ > self.n_features_to_select_: # Only perform these steps if it will not remove needed features
                    lowest_feature_loc = X.columns.get_loc(X_copy.columns[np.argmin(fi)]) # Index in ranking_ of the least important feature
                    self.ranking_[lowest_feature_loc] = self.n_features_ - self.n_features_to_select_ + 1 # Assign rank for least important feature to its index in ranking_
                    X_copy.drop(X_copy.columns[np.argmin(fi)], axis = 1, inplace = True) # Drop least important feature from X_copy
                    fi = np.delete(fi, np.argmin(fi)) # Remove least important feature from feature importance array
                    self.n_features_ = len(X_copy.columns) # Update n_features_
        self.ranking_[self.ranking_ < 1] = 1 # Mark all remaining features as 1
        for i in range(len(self.support_)): # Update support_ to be True at each index that ranking_ is 1
            if self.ranking_[i] == 1:
                self.support_[i] = True
        
    # Reduces the X to only contain the important columns as determined by the fit function
    def transform(self, X):
        X_copy = X.copy() # Copy X so that it is not modified
        for i in range(len(self.ranking_)): # Drop unimportant features from X_copy
            if self.support_[i] != True:
                X_copy.drop(X.columns[i], axis = 1, inplace = True)
        return X_copy # Return the modified version of X
    
    # Calls fit on X and y, then transforms X and returns it
    def fit_transform(self, X, y):
        self.fit(X, y)
        new_data = self.transform(X)
        return new_data

First, we will compare our implementation of Naive Bayes to the implementation found in SKLearn based on accuracy.

SKLearn

In [15]:
accuracy = []
precision = []
recall = []
f1 = []

In [16]:
gnb = GaussianNB()
gnb.fit(X_train, Y_train.values.ravel())
Y_pred = gnb.predict(X_test)
Y_predSKNB = Y_pred.copy()

accuracy.append(accuracy_score(Y_test, Y_pred))
precision.append(precision_score(Y_test, Y_pred, average='macro'))
recall.append(recall_score(Y_test, Y_pred, average='macro'))
f1.append(f1_score(Y_test, Y_pred, average='macro'))

Ours

In [17]:
gnb = NaiveBayes()
gnb.NBfit(X_train, Y_train.values.ravel())
X_test2 = X_test.to_numpy()
Y_pred = gnb.NBpredict(X_test2)
Y_predNB = Y_pred.copy()

accuracy.append(accuracy_score(Y_test, Y_pred)) 
precision.append(precision_score(Y_test, Y_pred, average='macro'))
recall.append(recall_score(Y_test, Y_pred, average='macro'))
f1.append(f1_score(Y_test, Y_pred, average='macro'))

Next, we will compare our version of Recursive Feature Elimination to the one in SKLearn.

SKLearn

In [18]:
estimator = RandomForestClassifier()
selector_SK = RFE(estimator)
selector_SK = selector_SK.fit(X_train, Y_train.values.ravel())
display(selector_SK.ranking_)
display(selector_SK.support_)

array([5, 7, 4, 3, 2, 6, 1, 1, 1, 1, 1])

array([False, False, False, False, False, False,  True,  True,  True,
        True,  True])

Now that we have the optimal number of features and the results from SKLearn RFE, we will use the number of features with our own RFE algorihtm.

Ours

In [19]:
selector_OU = Recursive_Feature_Elimination()
selector_OU.fit(X_train, Y_train.values.ravel())
display(selector_OU.ranking_)
display(selector_OU.support_)

array([4., 6., 3., 2., 1., 5., 1., 1., 1., 1., 1.])

array([False, False, False, False,  True, False,  True,  True,  True,
        True,  True])

From here, we will transform X_test and X_train using the SKLearn RFE and see its effects on both implementations of Naive Bayes.

In [20]:
X_test_SK = selector_SK.transform(X_test)
X_train_SK = selector_SK.transform(X_train)

SKLearn

In [21]:
gnb = GaussianNB()
gnb.fit(X_train_SK, Y_train.values.ravel())
Y_pred = gnb.predict(X_test_SK)

accuracy.append(accuracy_score(Y_test, Y_pred)) 
precision.append(precision_score(Y_test, Y_pred, average='macro'))
recall.append(recall_score(Y_test, Y_pred, average='macro'))
f1.append(f1_score(Y_test, Y_pred, average='macro'))

  _warn_prf(average, modifier, msg_start, len(result))


Ours

In [22]:
gnb = NaiveBayes()
gnb.NBfit(X_train_SK, Y_train.values.ravel())
Y_pred = gnb.NBpredict(X_test_SK)

accuracy.append(accuracy_score(Y_test, Y_pred)) 
precision.append(precision_score(Y_test, Y_pred, average='macro'))
recall.append(recall_score(Y_test, Y_pred, average='macro'))
f1.append(f1_score(Y_test, Y_pred, average='macro'))

  _warn_prf(average, modifier, msg_start, len(result))


Now, we will transform X_test and X_train using our implementation of RFE and see its effects on both implementations of Naive Bayes.

In [23]:
X_test_OU = selector_OU.transform(X_test)
X_train_OU = selector_OU.transform(X_train)

SKLearn

In [24]:
gnb = GaussianNB()
gnb.fit(X_train_OU, Y_train.values.ravel())
Y_pred = gnb.predict(X_test_OU)
Y_predSKRFE = Y_pred.copy()

accuracy.append(accuracy_score(Y_test, Y_pred)) 
precision.append(precision_score(Y_test, Y_pred, average='macro'))
recall.append(recall_score(Y_test, Y_pred, average='macro'))
f1.append(f1_score(Y_test, Y_pred, average='macro'))

Ours

In [25]:
gnb = NaiveBayes()
gnb.NBfit(X_train_OU, Y_train.values.ravel())
X_test_OU2 = X_test_OU.to_numpy()
Y_pred = gnb.NBpredict(X_test_OU2)
Y_predRFE = Y_pred.copy()

accuracy.append(accuracy_score(Y_test, Y_pred)) 
precision.append(precision_score(Y_test, Y_pred, average='macro'))
recall.append(recall_score(Y_test, Y_pred, average='macro'))
f1.append(f1_score(Y_test, Y_pred, average='macro'))

Here we have a comparsion of all the different combinations according to the metrics: accuracy, precision, recall, and F1 Score.

In [26]:
index_values = ['Accuracy', 'Precision', 'Recall', 'F1_Score']
column_values = ['SK NB', 'Our NB', 'SK NB w/ SK RFE', 'Our NB w/ SK RFE', 'SK NB w/ Our RFE', 'Our NB w/ Our RFE']
metrics = pd.DataFrame(data = [accuracy, precision, recall, f1], index=index_values, columns=column_values)
metrics

Unnamed: 0,SK NB,Our NB,SK NB w/ SK RFE,Our NB w/ SK RFE,SK NB w/ Our RFE,Our NB w/ Our RFE
Accuracy,0.493628,0.436023,0.465498,0.48084,0.479631,0.425618
Precision,0.412902,0.404622,0.181253,0.12021,0.34754,0.331506
Recall,0.356008,0.39067,0.25625,0.25,0.318783,0.332515
F1_Score,0.341525,0.331569,0.191399,0.162354,0.288146,0.290744


We will now join Y_pred, Y_test, and X_test to see a snapshot of the predictions versus the actual results from both our Naive Bayes algorithm and our Naive Bayes algorithm with Recursive Feature Elimination.

In [27]:
Y_predNB = pd.DataFrame(Y_predNB)
Y_predNB.columns = ["Predicted Type"]
Y_predNB.replace([0, 1, 2, 3], ['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], inplace=True)
Y_predRFE = pd.DataFrame(Y_predRFE)
Y_predRFE.columns = ["Predicted Type"]
Y_predRFE.replace([0, 1, 2, 3], ['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], inplace=True)
Y_test.columns = ["Actual Type"]
Y_test.replace([0, 1, 2, 3], ['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], inplace=True)
NB = pd.concat([Y_predNB.reset_index(drop=True) ,Y_test.reset_index(drop=True), X_test.reset_index(drop=True)], axis=1)
RFE = pd.concat([Y_predRFE.reset_index(drop=True) ,Y_test.reset_index(drop=True), X_test.reset_index(drop=True)], axis=1)

First, we will list Predictions vs Actual on Theft from Vehicle.

In [28]:
display("Original Naive Bayes: Theft from Vehicle", NB[NB['Predicted Type'] == "Theft from Vehicle"])
display("Naive Bayes with Recursive Feature Elimination: Theft from Vehicle", RFE[RFE['Predicted Type'] == "Theft from Vehicle"])

'Original Naive Bayes: Theft from Vehicle'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
0,Theft from Vehicle,Break and Enter Residential/Other,2004,1,8,20,0,4,2,488974.18,5457668.25,49.271674,-123.151570
1,Theft from Vehicle,Theft from Vehicle,2004,9,30,20,0,4,40,491651.57,5458615.15,49.280234,-123.114785
2,Theft from Vehicle,Theft from Vehicle,2009,6,26,18,0,5,26,493681.15,5458813.86,49.282046,-123.086883
3,Theft from Vehicle,Theft from Vehicle,2013,8,7,16,0,3,32,491355.26,5460631.89,49.298371,-123.118902
6,Theft from Vehicle,Theft from Vehicle,2005,6,27,19,0,1,26,489359.90,5457818.89,49.273036,-123.146272
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88507,Theft from Vehicle,Theft from Vehicle,2014,11,30,17,0,7,48,491008.90,5458315.77,49.277532,-123.123614
88510,Theft from Vehicle,Other Theft,2015,10,7,7,8,3,41,497308.77,5456155.92,49.258164,-123.036986
88511,Theft from Vehicle,Theft from Vehicle,2015,10,24,0,0,6,43,490116.26,5459123.65,49.284785,-123.135906
88512,Theft from Vehicle,Mischief,2003,9,22,23,0,1,39,490092.82,5459451.96,49.287738,-123.136237


'Naive Bayes with Recursive Feature Elimination: Theft from Vehicle'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
0,Theft from Vehicle,Break and Enter Residential/Other,2004,1,8,20,0,4,2,488974.18,5457668.25,49.271674,-123.151570
1,Theft from Vehicle,Theft from Vehicle,2004,9,30,20,0,4,40,491651.57,5458615.15,49.280234,-123.114785
2,Theft from Vehicle,Theft from Vehicle,2009,6,26,18,0,5,26,493681.15,5458813.86,49.282046,-123.086883
3,Theft from Vehicle,Theft from Vehicle,2013,8,7,16,0,3,32,491355.26,5460631.89,49.298371,-123.118902
4,Theft from Vehicle,Other Theft,2011,5,16,20,16,1,20,490797.66,5459282.31,49.286223,-123.126541
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88510,Theft from Vehicle,Other Theft,2015,10,7,7,8,3,41,497308.77,5456155.92,49.258164,-123.036986
88511,Theft from Vehicle,Theft from Vehicle,2015,10,24,0,0,6,43,490116.26,5459123.65,49.284785,-123.135906
88512,Theft from Vehicle,Mischief,2003,9,22,23,0,1,39,490092.82,5459451.96,49.287738,-123.136237
88513,Theft from Vehicle,Mischief,2004,3,10,17,15,3,11,495041.52,5458609.51,49.280220,-123.068175


Next, we will list Predictions vs Actual on Mischief.

In [29]:
display("Original Naive Bayes: Mischief", NB[NB['Predicted Type'] == "Mischief"])
display("Naive Bayes with Recursive Feature Elimination: Mischief", RFE[RFE['Predicted Type'] == "Mischief"])

'Original Naive Bayes: Mischief'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
32,Mischief,Mischief,2015,1,17,1,53,6,3,490470.88,5458197.99,49.276464,-123.131008
37,Mischief,Mischief,2009,2,17,2,30,2,8,496297.61,5458207.06,49.276609,-123.050901
63,Mischief,Mischief,2008,8,21,1,50,4,34,489905.08,5456038.01,49.257026,-123.138732
126,Mischief,Break and Enter Residential/Other,2013,8,29,2,47,4,35,492726.16,5458199.65,49.276510,-123.100002
132,Mischief,Mischief,2012,9,11,4,55,2,37,496210.84,5456579.20,49.261965,-123.052079
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88065,Mischief,Break and Enter Residential/Other,2011,5,11,1,48,3,19,492192.12,5457576.84,49.270901,-123.107332
88130,Mischief,Theft from Vehicle,2008,2,17,0,30,7,7,495083.83,5457524.49,49.270460,-123.067580
88214,Mischief,Theft from Vehicle,2017,5,21,6,35,7,20,498073.38,5456160.71,49.258210,-123.026478
88413,Mischief,Theft from Vehicle,2013,10,12,1,30,6,41,494830.61,5456932.79,49.265136,-123.071054


'Naive Bayes with Recursive Feature Elimination: Mischief'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
160,Mischief,Other Theft,2008,8,5,10,55,2,32,493399.03,5452635.98,49.226471,-123.090660
554,Mischief,Theft from Vehicle,2016,8,6,21,35,6,31,498186.19,5458384.70,49.278215,-123.024937
699,Mischief,Other Theft,2008,5,15,19,57,4,20,492603.78,5452467.75,49.224949,-123.101579
1205,Mischief,Break and Enter Residential/Other,2011,6,23,18,49,4,25,497227.78,5455512.38,49.252375,-123.038094
1595,Mischief,Mischief,2008,6,10,5,48,2,24,494380.79,5453316.85,49.232606,-123.077186
...,...,...,...,...,...,...,...,...,...,...,...,...,...
87329,Mischief,Other Theft,2005,2,16,12,55,3,7,493391.44,5452727.13,49.227291,-123.090766
87540,Mischief,Other Theft,2003,11,26,20,43,3,48,495513.37,5454530.92,49.243536,-123.061642
87666,Mischief,Mischief,2003,9,12,22,40,5,37,498106.86,5458704.55,49.281092,-123.026030
87859,Mischief,Mischief,2008,2,9,18,41,6,6,497807.78,5456721.84,49.263256,-123.030131


Next, we will list Predictions vs Actual on Break and Enter Residential/Other.

In [30]:
display("Original Naive Bayes: Break and Enter Residential/Other", NB[NB['Predicted Type'] == "Break and Enter Residential/Other"])
display("Naive Bayes with Recursive Feature Elimination: Break and Enter Residential/Other", RFE[RFE['Predicted Type'] == "Break and Enter Residential/Other"])

'Original Naive Bayes: Break and Enter Residential/Other'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
11,Break and Enter Residential/Other,Theft from Vehicle,2005,2,18,18,0,5,7,489537.22,5451258.02,49.214022,-123.143663
15,Break and Enter Residential/Other,Mischief,2007,10,11,1,41,4,41,490750.27,5450638.13,49.208466,-123.126993
16,Break and Enter Residential/Other,Theft from Vehicle,2005,8,15,16,30,1,33,490186.83,5450074.65,49.203389,-123.134714
21,Break and Enter Residential/Other,Break and Enter Residential/Other,2006,1,2,1,30,1,1,494653.13,5452495.27,49.225218,-123.073434
22,Break and Enter Residential/Other,Break and Enter Residential/Other,2007,2,5,9,0,1,6,496487.36,5454838.51,49.246309,-123.048263
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88473,Break and Enter Residential/Other,Mischief,2013,4,27,19,54,6,17,498038.64,5453379.95,49.233196,-123.026942
88474,Break and Enter Residential/Other,Other Theft,2009,1,28,20,31,3,5,489762.32,5450762.71,49.209571,-123.140560
88478,Break and Enter Residential/Other,Theft from Vehicle,2004,7,13,19,0,2,29,498245.72,5455410.44,49.251461,-123.024106
88497,Break and Enter Residential/Other,Theft from Vehicle,2008,5,11,3,15,7,19,494995.46,5451114.71,49.212802,-123.068715


'Naive Bayes with Recursive Feature Elimination: Break and Enter Residential/Other'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
11,Break and Enter Residential/Other,Theft from Vehicle,2005,2,18,18,0,5,7,489537.22,5451258.02,49.214022,-123.143663
15,Break and Enter Residential/Other,Mischief,2007,10,11,1,41,4,41,490750.27,5450638.13,49.208466,-123.126993
16,Break and Enter Residential/Other,Theft from Vehicle,2005,8,15,16,30,1,33,490186.83,5450074.65,49.203389,-123.134714
21,Break and Enter Residential/Other,Break and Enter Residential/Other,2006,1,2,1,30,1,1,494653.13,5452495.27,49.225218,-123.073434
31,Break and Enter Residential/Other,Mischief,2005,11,19,4,0,6,46,485895.79,5453328.33,49.232572,-123.193736
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88478,Break and Enter Residential/Other,Theft from Vehicle,2004,7,13,19,0,2,29,498245.72,5455410.44,49.251461,-123.024106
88482,Break and Enter Residential/Other,Theft from Vehicle,2011,2,26,20,0,6,8,496806.45,5453976.59,49.238558,-123.043872
88497,Break and Enter Residential/Other,Theft from Vehicle,2008,5,11,3,15,7,19,494995.46,5451114.71,49.212802,-123.068715
88501,Break and Enter Residential/Other,Theft from Vehicle,2006,2,15,14,10,3,7,492648.10,5450886.95,49.210730,-123.100941


Finally, we will list Predictions vs Actual on Other Theft.

In [31]:
display("Original Naive Bayes: Other Theft", NB[NB['Predicted Type'] == "Other Theft"])
display("Naive Bayes with Recursive Feature Elimination: Other Theft", RFE[RFE['Predicted Type'] == "Other Theft"])

'Original Naive Bayes: Other Theft'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
4,Other Theft,Other Theft,2011,5,16,20,16,1,20,490797.66,5459282.31,49.286223,-123.126541
5,Other Theft,Mischief,2015,12,28,15,3,1,53,494384.28,5458209.26,49.276614,-123.077206
10,Other Theft,Theft from Vehicle,2008,7,19,18,47,6,29,491833.57,5459286.88,49.286279,-123.112296
17,Other Theft,Break and Enter Residential/Other,2007,10,1,17,15,1,40,490729.83,5458622.34,49.280285,-123.127458
19,Other Theft,Theft from Vehicle,2014,4,6,18,0,7,14,493065.48,5458615.63,49.280256,-123.095344
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88505,Other Theft,Mischief,2006,10,8,14,45,7,40,496487.75,5458028.74,49.275006,-123.048286
88508,Other Theft,Theft from Vehicle,2015,10,22,18,0,4,43,494935.90,5457413.51,49.269461,-123.069612
88509,Other Theft,Theft from Vehicle,2011,10,10,22,30,1,41,492580.71,5458321.63,49.277606,-123.102004
88513,Other Theft,Mischief,2004,3,10,17,15,3,11,495041.52,5458609.51,49.280220,-123.068175


'Naive Bayes with Recursive Feature Elimination: Other Theft'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
10,Other Theft,Theft from Vehicle,2008,7,19,18,47,6,29,491833.57,5459286.88,49.286279,-123.112296
23,Other Theft,Theft from Vehicle,2009,12,21,11,56,1,52,490955.33,5459482.91,49.288030,-123.124377
24,Other Theft,Theft from Vehicle,2004,5,11,7,30,2,20,492257.60,5458266.56,49.277106,-123.106445
29,Other Theft,Other Theft,2007,2,24,14,43,6,8,491827.08,5459079.55,49.284414,-123.112381
30,Other Theft,Theft from Vehicle,2017,6,16,18,35,5,24,490951.41,5456220.14,49.258680,-123.124357
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88492,Other Theft,Theft from Vehicle,2008,5,25,23,30,7,21,493711.74,5458908.08,49.282894,-123.086463
88493,Other Theft,Break and Enter Residential/Other,2008,2,15,12,44,5,7,492627.03,5454817.01,49.246081,-123.101303
88505,Other Theft,Mischief,2006,10,8,14,45,7,40,496487.75,5458028.74,49.275006,-123.048286
88509,Other Theft,Theft from Vehicle,2011,10,10,22,30,1,41,492580.71,5458321.63,49.277606,-123.102004


We will now perform the same join using the SKLearn Naive Bayes with our implementation of Recursive Feature Elimination.

In [32]:
Y_predSKNB = pd.DataFrame(Y_predSKNB)
Y_predSKNB.columns = ["Predicted Type"]
Y_predSKNB.replace([0, 1, 2, 3], ['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], inplace=True)
Y_predSKRFE = pd.DataFrame(Y_predSKRFE)
Y_predSKRFE.columns = ["Predicted Type"]
Y_predSKRFE.replace([0, 1, 2, 3], ['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], inplace=True)
Y_test.columns = ["Actual Type"]
Y_test.replace([0, 1, 2, 3], ['Theft from Vehicle', 'Mischief', 'Break and Enter Residential/Other', 'Other Theft'], inplace=True)
SKNB = pd.concat([Y_predSKNB.reset_index(drop=True) ,Y_test.reset_index(drop=True), X_test.reset_index(drop=True)], axis=1)
SKRFE = pd.concat([Y_predSKRFE.reset_index(drop=True) ,Y_test.reset_index(drop=True), X_test.reset_index(drop=True)], axis=1)

In [33]:
display("Original Naive Bayes: Theft from Vehicle", SKNB[SKNB['Predicted Type'] == "Theft from Vehicle"])
display("Naive Bayes with Recursive Feature Elimination: Theft from Vehicle", SKRFE[SKRFE['Predicted Type'] == "Theft from Vehicle"])

'Original Naive Bayes: Theft from Vehicle'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
0,Theft from Vehicle,Break and Enter Residential/Other,2004,1,8,20,0,4,2,488974.18,5457668.25,49.271674,-123.151570
1,Theft from Vehicle,Theft from Vehicle,2004,9,30,20,0,4,40,491651.57,5458615.15,49.280234,-123.114785
2,Theft from Vehicle,Theft from Vehicle,2009,6,26,18,0,5,26,493681.15,5458813.86,49.282046,-123.086883
3,Theft from Vehicle,Theft from Vehicle,2013,8,7,16,0,3,32,491355.26,5460631.89,49.298371,-123.118902
4,Theft from Vehicle,Other Theft,2011,5,16,20,16,1,20,490797.66,5459282.31,49.286223,-123.126541
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88510,Theft from Vehicle,Other Theft,2015,10,7,7,8,3,41,497308.77,5456155.92,49.258164,-123.036986
88511,Theft from Vehicle,Theft from Vehicle,2015,10,24,0,0,6,43,490116.26,5459123.65,49.284785,-123.135906
88512,Theft from Vehicle,Mischief,2003,9,22,23,0,1,39,490092.82,5459451.96,49.287738,-123.136237
88513,Theft from Vehicle,Mischief,2004,3,10,17,15,3,11,495041.52,5458609.51,49.280220,-123.068175


'Naive Bayes with Recursive Feature Elimination: Theft from Vehicle'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
0,Theft from Vehicle,Break and Enter Residential/Other,2004,1,8,20,0,4,2,488974.18,5457668.25,49.271674,-123.151570
1,Theft from Vehicle,Theft from Vehicle,2004,9,30,20,0,4,40,491651.57,5458615.15,49.280234,-123.114785
2,Theft from Vehicle,Theft from Vehicle,2009,6,26,18,0,5,26,493681.15,5458813.86,49.282046,-123.086883
3,Theft from Vehicle,Theft from Vehicle,2013,8,7,16,0,3,32,491355.26,5460631.89,49.298371,-123.118902
4,Theft from Vehicle,Other Theft,2011,5,16,20,16,1,20,490797.66,5459282.31,49.286223,-123.126541
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88510,Theft from Vehicle,Other Theft,2015,10,7,7,8,3,41,497308.77,5456155.92,49.258164,-123.036986
88511,Theft from Vehicle,Theft from Vehicle,2015,10,24,0,0,6,43,490116.26,5459123.65,49.284785,-123.135906
88512,Theft from Vehicle,Mischief,2003,9,22,23,0,1,39,490092.82,5459451.96,49.287738,-123.136237
88513,Theft from Vehicle,Mischief,2004,3,10,17,15,3,11,495041.52,5458609.51,49.280220,-123.068175


In [34]:
display("Original Naive Bayes: Mischief", SKNB[SKNB['Predicted Type'] == "Mischief"])
display("Naive Bayes with Recursive Feature Elimination: Mischief", SKRFE[SKRFE['Predicted Type'] == "Mischief"])

'Original Naive Bayes: Mischief'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
32,Mischief,Mischief,2015,1,17,1,53,6,3,490470.88,5458197.99,49.276464,-123.131008
37,Mischief,Mischief,2009,2,17,2,30,2,8,496297.61,5458207.06,49.276609,-123.050901
63,Mischief,Mischief,2008,8,21,1,50,4,34,489905.08,5456038.01,49.257026,-123.138732
71,Mischief,Theft from Vehicle,2017,4,28,6,45,5,17,491792.73,5459252.20,49.285966,-123.112857
118,Mischief,Theft from Vehicle,2015,1,19,5,51,1,4,495339.83,5456209.14,49.258631,-123.064046
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88230,Mischief,Theft from Vehicle,2005,5,16,5,58,1,20,491758.44,5459079.59,49.284413,-123.113325
88265,Mischief,Theft from Vehicle,2007,6,25,7,47,1,26,491789.20,5458959.71,49.283335,-123.112899
88413,Mischief,Theft from Vehicle,2013,10,12,1,30,6,41,494830.61,5456932.79,49.265136,-123.071054
88426,Mischief,Mischief,2013,8,23,1,40,5,34,493177.29,5453039.25,49.230096,-123.093712


'Naive Bayes with Recursive Feature Elimination: Mischief'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
160,Mischief,Other Theft,2008,8,5,10,55,2,32,493399.03,5452635.98,49.226471,-123.090660
162,Mischief,Break and Enter Residential/Other,2004,7,30,17,56,5,31,498126.75,5459571.69,49.288892,-123.025760
165,Mischief,Theft from Vehicle,2012,6,24,20,45,7,25,492588.75,5453315.09,49.232571,-123.101801
448,Mischief,Theft from Vehicle,2013,10,8,18,50,2,41,493844.89,5454164.30,49.240224,-123.084560
465,Mischief,Mischief,2015,4,27,3,56,1,18,487404.45,5455141.38,49.248914,-123.173070
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88060,Mischief,Other Theft,2010,3,6,12,45,6,9,495228.06,5453412.77,49.233476,-123.065549
88086,Mischief,Break and Enter Residential/Other,2008,8,10,5,57,7,32,495604.91,5453347.97,49.232896,-123.060371
88141,Mischief,Break and Enter Residential/Other,2017,4,13,22,51,4,15,487467.43,5455037.32,49.247979,-123.172201
88172,Mischief,Mischief,2003,12,5,18,45,5,49,497949.85,5458807.78,49.282020,-123.028189


In [35]:
display("Original Naive Bayes: Break and Enter Residential/Other", SKNB[SKNB['Predicted Type'] == "Break and Enter Residential/Other"])
display("Naive Bayes with Recursive Feature Elimination: Break and Enter Residential/Other", SKRFE[SKRFE['Predicted Type'] == "Break and Enter Residential/Other"])

'Original Naive Bayes: Break and Enter Residential/Other'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
15,Break and Enter Residential/Other,Mischief,2007,10,11,1,41,4,41,490750.27,5450638.13,49.208466,-123.126993
16,Break and Enter Residential/Other,Theft from Vehicle,2005,8,15,16,30,1,33,490186.83,5450074.65,49.203389,-123.134714
21,Break and Enter Residential/Other,Break and Enter Residential/Other,2006,1,2,1,30,1,1,494653.13,5452495.27,49.225218,-123.073434
31,Break and Enter Residential/Other,Mischief,2005,11,19,4,0,6,46,485895.79,5453328.33,49.232572,-123.193736
36,Break and Enter Residential/Other,Break and Enter Residential/Other,2015,12,5,18,30,6,49,497546.85,5452720.38,49.227261,-123.033693
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88455,Break and Enter Residential/Other,Break and Enter Residential/Other,2014,8,31,12,45,7,35,497955.56,5450997.34,49.211764,-123.028071
88470,Break and Enter Residential/Other,Theft from Vehicle,2004,11,28,5,32,7,48,489975.05,5450990.73,49.211625,-123.137645
88472,Break and Enter Residential/Other,Mischief,2014,3,10,14,0,1,11,497995.76,5453265.15,49.232163,-123.027530
88474,Break and Enter Residential/Other,Other Theft,2009,1,28,20,31,3,5,489762.32,5450762.71,49.209571,-123.140560


'Naive Bayes with Recursive Feature Elimination: Break and Enter Residential/Other'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
15,Break and Enter Residential/Other,Mischief,2007,10,11,1,41,4,41,490750.27,5450638.13,49.208466,-123.126993
16,Break and Enter Residential/Other,Theft from Vehicle,2005,8,15,16,30,1,33,490186.83,5450074.65,49.203389,-123.134714
36,Break and Enter Residential/Other,Break and Enter Residential/Other,2015,12,5,18,30,6,49,497546.85,5452720.38,49.227261,-123.033693
50,Break and Enter Residential/Other,Mischief,2014,5,27,18,0,2,22,497388.56,5451589.82,49.217091,-123.035860
53,Break and Enter Residential/Other,Mischief,2003,12,19,18,50,5,51,489250.18,5453366.65,49.232985,-123.147661
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88470,Break and Enter Residential/Other,Theft from Vehicle,2004,11,28,5,32,7,48,489975.05,5450990.73,49.211625,-123.137645
88472,Break and Enter Residential/Other,Mischief,2014,3,10,14,0,1,11,497995.76,5453265.15,49.232163,-123.027530
88473,Break and Enter Residential/Other,Mischief,2013,4,27,19,54,6,17,498038.64,5453379.95,49.233196,-123.026942
88474,Break and Enter Residential/Other,Other Theft,2009,1,28,20,31,3,5,489762.32,5450762.71,49.209571,-123.140560


In [36]:
display("Original Naive Bayes: Other Theft", SKNB[SKNB['Predicted Type'] == "Other Theft"])
display("Naive Bayes with Recursive Feature Elimination: Other Theft", SKRFE[SKRFE['Predicted Type'] == "Other Theft"])

'Original Naive Bayes: Other Theft'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
10,Other Theft,Theft from Vehicle,2008,7,19,18,47,6,29,491833.57,5459286.88,49.286279,-123.112296
23,Other Theft,Theft from Vehicle,2009,12,21,11,56,1,52,490955.33,5459482.91,49.288030,-123.124377
29,Other Theft,Other Theft,2007,2,24,14,43,6,8,491827.08,5459079.55,49.284414,-123.112381
30,Other Theft,Theft from Vehicle,2017,6,16,18,35,5,24,490951.41,5456220.14,49.258680,-123.124357
33,Other Theft,Other Theft,2017,6,14,17,46,3,24,490240.33,5459772.54,49.290624,-123.134216
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88487,Other Theft,Other Theft,2007,1,9,12,44,2,2,491394.12,5458844.26,49.282291,-123.118329
88490,Other Theft,Mischief,2012,11,23,14,39,5,47,493605.51,5455356.03,49.250941,-123.087867
88493,Other Theft,Break and Enter Residential/Other,2008,2,15,12,44,5,7,492627.03,5454817.01,49.246081,-123.101303
88505,Other Theft,Mischief,2006,10,8,14,45,7,40,496487.75,5458028.74,49.275006,-123.048286


'Naive Bayes with Recursive Feature Elimination: Other Theft'

Unnamed: 0,Predicted Type,Actual Type,YEAR,MONTH,DAY,HOUR,MINUTE,DayOfWeek,WeekOfYear,X,Y,Latitude,Longitude
10,Other Theft,Theft from Vehicle,2008,7,19,18,47,6,29,491833.57,5459286.88,49.286279,-123.112296
23,Other Theft,Theft from Vehicle,2009,12,21,11,56,1,52,490955.33,5459482.91,49.288030,-123.124377
29,Other Theft,Other Theft,2007,2,24,14,43,6,8,491827.08,5459079.55,49.284414,-123.112381
32,Other Theft,Mischief,2015,1,17,1,53,6,3,490470.88,5458197.99,49.276464,-123.131008
33,Other Theft,Other Theft,2017,6,14,17,46,3,24,490240.33,5459772.54,49.290624,-123.134216
...,...,...,...,...,...,...,...,...,...,...,...,...,...
88480,Other Theft,Other Theft,2009,3,17,14,39,2,12,491445.91,5458896.41,49.282761,-123.117618
88487,Other Theft,Other Theft,2007,1,9,12,44,2,2,491394.12,5458844.26,49.282291,-123.118329
88493,Other Theft,Break and Enter Residential/Other,2008,2,15,12,44,5,7,492627.03,5454817.01,49.246081,-123.101303
88505,Other Theft,Mischief,2006,10,8,14,45,7,40,496487.75,5458028.74,49.275006,-123.048286
