# Analysis of Crime Reports in the City and County of Denver

#### Chris Richards
#### Practicum 2, Summer 2020
#### Regis University


### Support Vector Machine (SVM) Model

In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import svm


#### Read in data
The positions of the date columns are saved to a list before reading in the data.  That list is passed as the "parse_dates" parameter when loading the data, so the date features are read in as date objects rather than strings.

In [5]:
# Positions of the date columns in the CSV; read_csv parses these into date
# objects instead of leaving them as strings.
# NOTE(review): the absolute path below is machine-specific — consider reading
# from a configurable data directory instead.
dates = [7, 8, 9]
df = pd.read_csv(
    r"E:\Regis\Practicum_2\df_cleaned.csv",
    parse_dates=dates,
)


#### Drop unused columns
The index, the three date columns, the "offense_type_id" and "incident_address" columns, and the redundant "geo_x" and "geo_y" features are removed.

In [6]:
# Build the modelling frame by removing the 'Unnamed: 0' index column, the
# three date columns, the offense type and address fields, and the
# geo_x / geo_y coordinates. The original frame `df` is left untouched.
df2 = df.drop(
    labels=[
        'Unnamed: 0',
        'first_occurrence_date',
        'last_occurrence_date',
        'reported_date',
        'offense_type_id',
        'incident_address',
        'geo_x',
        'geo_y',
    ],
    axis='columns',
)
# Summarize the remaining columns, their non-null counts and dtypes
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371189 entries, 0 to 371188
Data columns (total 10 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   incident_id             371189 non-null  float64
 1   offense_id              371189 non-null  int64  
 2   offense_code            371189 non-null  int64  
 3   offense_code_extension  371189 non-null  int64  
 4   offense_category_id     371189 non-null  object 
 5   geo_lon                 371189 non-null  float64
 6   geo_lat                 371189 non-null  float64
 7   district_id             371189 non-null  int64  
 8   precinct_id             371189 non-null  int64  
 9   neighborhood            371189 non-null  object 
dtypes: float64(3), int64(5), object(2)
memory usage: 28.3+ MB


#### Split the target feature 
Create two dataframes - "rawX" with the independent variables, and "rawy" with the target variable.

In [7]:
# Target variable: the offense category label of each report
rawy = df2['offense_category_id']
# Independent variables: every column except the target.
# NOTE(review): identifier-like columns (incident_id, offense_id) and
# offense_code / offense_code_extension remain among the features — confirm
# this is intended, since offense_code may directly encode the category.
rawX = df2.loc[:, ~df2.columns.isin(['offense_category_id'])]

#### Encode the categorical variables
The "get_dummies" function will automatically identify the categorical variables in the data and one-hot encode them for use in the model.

In [8]:
# The target is kept as the raw category labels (it is not one-hot encoded)
y = rawy
# One-hot encode the categorical feature columns; numeric columns pass
# through unchanged
X = pd.get_dummies(data=rawX)

#### Create the test and training sets
The dataframe will be randomly sampled with 70% used for training and the remaining 30% reserved for testing the model.  

In [9]:
# Hold out 30% of the rows for testing and train on the remaining 70%.
# The fixed random_state keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=70,
)

#### Build the SVM model
The SVM will be built using an SVM classifier algorithm utilizing the radial-basis function kernel, "RBF".  The RBF kernel is commonly used in SVM classification models. 

In [10]:
# Create an SVM classifier with the radial basis function (RBF) kernel and
# train it on the training set; fit() returns the fitted estimator itself.
# NOTE(review): the features are passed to the SVM without any scaling step
# in this notebook — consider standardizing them first and confirm whether
# that changes the accuracy reported below.
clf = svm.SVC(kernel='rbf').fit(X_train, y_train)

# Predict the offense category for every row of the test set
y_pred = clf.predict(X_test)

In [11]:
# Persist the fitted classifier to disk as a pickle file so it can be
# reloaded later without retraining
import pickle

pkl_filename = "pickle_model.pkl"
with open(pkl_filename, mode='wb') as model_file:
    pickle.dump(clf, model_file)

#### Get the accuracy score of the test predictions

In [12]:
# scikit-learn's metrics module supplies the accuracy calculation
from sklearn import metrics

# Accuracy: the share of test rows whose predicted label matches the true one
print("Accuracy:", metrics.accuracy_score(y_true=y_test, y_pred=y_pred))

Accuracy: 0.29431468160959795


The model has an accuracy of 29.43%.  

#### Load the model and predict again


In [13]:
# Reload the classifier from the pickle file written by the save cell above.
# The same `pkl_filename` is used so the file that was saved is the file that
# is loaded (the save cell writes to a relative path, not an absolute one).
# Only unpickle files you created yourself: pickle.load can execute arbitrary
# code contained in the file.
with open(pkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

# Calculate the accuracy score of the reloaded model on the test set; it is
# expected to match the accuracy reported for `clf` above
score = pickle_model.score(X_test, y_test)
print("Test score: {0:.2f} %".format(100 * score))

Test score: 29.43 %


### SVM with various kernels

In [None]:
C = 1.0  # SVM regularization parameter, shared by all four models

# NOTE(review): each kernel SVC below is fitted on the full training set;
# these fits may be very slow at this data size — confirm they complete, or
# consider fitting on a sample first.

# SVC with linear kernel
svc = svm.SVC(kernel='linear', C=C).fit(X_train, y_train)
# LinearSVC (linear kernel); random_state seeds the solver's data shuffling so
# repeated runs produce the same model (same seed as the train/test split)
lin_svc = svm.LinearSVC(C=C, random_state=70).fit(X_train, y_train)
# SVC with RBF kernel
rbf_svc = svm.SVC(kernel='rbf', gamma=0.7, C=C).fit(X_train, y_train)
# SVC with polynomial (degree 3) kernel
poly_svc = svm.SVC(kernel='poly', degree=3, C=C).fit(X_train, y_train)

#### Predict on the test set for each model

In [None]:
# Run every fitted model over the test set, in the order
# linear SVC -> LinearSVC -> RBF SVC -> polynomial SVC
y_pred_svc, y_pred_lin_svc, y_pred_rbf, y_pred_poly = (
    model.predict(X_test)
    for model in (svc, lin_svc, rbf_svc, poly_svc)
)

#### Get the accuracy of the models

In [None]:
# Report, for each model, the share of test rows it classified correctly
for label, predictions in (
    ("SVC Accuracy:", y_pred_svc),
    ("Linear SVC Accuracy:", y_pred_lin_svc),
    ("RBF SVC Accuracy:", y_pred_rbf),
    ("Poly SVC Accuracy:", y_pred_poly),
):
    print(label, metrics.accuracy_score(y_test, predictions))