In [1]:
import pandas as pd
import requests
import json
from pprint import pprint
import time

# Data Import
Calling API and Importing JSON File

In [2]:
url = "https://services5.arcgis.com/54falWtcpty3V47Z/arcgis/rest/services/general_offenses_year3/FeatureServer/0/query?where=1%3D1&outFields=*&outSR=4326&resultType=standard&f=json"
json = json.loads(requests.get(url).text)

In [3]:
json

{'objectIdFieldName': 'OBJECTID',
 'uniqueIdField': {'name': 'OBJECTID', 'isSystemMaintained': True},
 'globalIdFieldName': '',
 'fields': [{'name': 'OBJECTID',
   'type': 'esriFieldTypeOID',
   'alias': 'OBJECTID',
   'sqlType': 'sqlTypeOther',
   'domain': None,
   'defaultValue': None},
  {'name': 'Record_ID',
   'type': 'esriFieldTypeString',
   'alias': 'Record_ID',
   'sqlType': 'sqlTypeNVarchar',
   'length': 10,
   'domain': None,
   'defaultValue': None},
  {'name': 'Offense_Code',
   'type': 'esriFieldTypeString',
   'alias': 'Offense_Code',
   'sqlType': 'sqlTypeNVarchar',
   'length': 10,
   'domain': None,
   'defaultValue': None},
  {'name': 'Offense_Ext',
   'type': 'esriFieldTypeString',
   'alias': 'Offense_Ext',
   'sqlType': 'sqlTypeNVarchar',
   'length': 10,
   'domain': None,
   'defaultValue': None},
  {'name': 'Offense_Category',
   'type': 'esriFieldTypeString',
   'alias': 'Offense_Category',
   'sqlType': 'sqlTypeNVarchar',
   'length': 20,
   'domain': None,

# Displaying JSON File

# Reformating Data
Pulling values from each key and listing them in a seperate variable

In [4]:
Beat = []
Description = []
Grid = []
ObjectID = []
Occurence_Date = []
Offense_Category = []
Offense_Code = []
Offense_Ext = []
Police_District = []
Record_ID = []

for i in range(len(json['features'])):
    for k, v in json['features'][i]['attributes'].items():
        if k == 'Beat':
            Beat.append(v)
        if k == 'Description':
            Description.append(v)
        if k == 'Grid':
            Grid.append(v)
        if k == 'OBJECTID':
            ObjectID.append(v)
        if k == 'Occurence_Date':
            Occurence_Date.append(v)
        if k == 'Offense_Category':
            Offense_Category.append(v)
        if k == 'Offense_Code':
            Offense_Code.append(v)
        if k == 'Offense_Ext':
            Offense_Ext.append(v)
        if k == 'Police_District':
            Police_District.append(v)
        if k == 'Record_ID':
            Record_ID.append(v)

# Creating New Dictionary
Creating a dictionary with our new lists

In [5]:
crime_dict = {'Beat': Beat,
 'Description': Description,
 'Grid': Grid,
 'ObjectID': ObjectID,
 'Occurence_Date': Occurence_Date,
 'Offense_Category': Offense_Category,
 'Offense_Code': Offense_Code,
 'Offense_Ext': Offense_Ext,
 'Police_District': Police_District,
 'Record_ID': Record_ID}

# Transforming Dictionary to DataFrame

In [6]:
crime_df = pd.DataFrame(crime_dict).set_index('ObjectID')

In [7]:
crime_df = crime_df.loc[crime_df['Beat'] != 'UI']
crime_df.dropna(inplace=True)
crime_df.reset_index(drop=True, inplace=True)
crime_df.count()

Beat                14136
Description         14136
Grid                14136
Occurence_Date      14136
Offense_Category    14136
Offense_Code        14136
Offense_Ext         14136
Police_District     14136
Record_ID           14136
dtype: int64

In [8]:
# Reviewing data types
crime_df.dtypes

Beat                object
Description         object
Grid                object
Occurence_Date       int64
Offense_Category    object
Offense_Code        object
Offense_Ext         object
Police_District     object
Record_ID           object
dtype: object

# Converting Data Types

In [9]:
crime_df['Grid'] = crime_df['Grid'].astype('int')
crime_df['Occurence_Date'] = crime_df['Occurence_Date'].astype('str')
crime_df['Offense_Code'] = crime_df['Offense_Code'].astype('int')
crime_df['Police_District'] = crime_df['Police_District'].astype('int')
crime_df['Record_ID'] = crime_df['Record_ID'].astype('int')

In [10]:
crime_df.dtypes

Beat                object
Description         object
Grid                 int32
Occurence_Date      object
Offense_Category    object
Offense_Code         int32
Offense_Ext         object
Police_District      int32
Record_ID            int32
dtype: object

In [11]:
crime_df

Unnamed: 0,Beat,Description,Grid,Occurence_Date,Offense_Category,Offense_Code,Offense_Ext,Police_District,Record_ID
0,6B,246 PC SHOOT OCCUP DWELL/VEH,1401,1577866620000,WEAPON OFFENSE,5213,4,6,1429759
1,2B,594(A)(2) VANDALISM-DAMAGE,564,1577872380000,VANDALISM,2999,33,2,1429760
2,3M,69 PC RESIST/OBSTRCT EXEC OFC,741,1577872080000,OBSTRUCTING,4801,5,3,1429779
3,6C,10851(A)VC TAKE VEH W/O OWNER,1441,1577876400000,STOLEN VEHICLE,2404,0,6,1429804
4,3B,23153(A) DUI ALCOHOL W/INJ,777,1577875380000,TRAFFIC,5404,10,3,1429831
...,...,...,...,...,...,...,...,...,...
14131,1A,484 PC PETTY THEFT,302,1590813480000,LARCENY,2399,2,1,1459402
14132,4B,484 PC PETTY THEFT,1351,1589946600000,LARCENY,2399,2,4,1459418
14133,6C,20002(A) HIT/RUN-FAIL INFO,1447,1590714000000,TRAFFIC,5401,0,6,1459429
14134,6E,TRAFFIC ACCIDENT-INJURY,1102,1590547260000,TRAFFIC,5400,0,6,1459437


In [12]:
Occurence_Date = []
for i in range(len(crime_df)):
        trimmed_unix = crime_df['Occurence_Date'][i][0:10]
        date_time = time.strftime('%a %m-%d-%Y, %I:%M:%S %p', time.localtime(int(trimmed_unix)))
        Occurence_Date.append(date_time)

crime_df['Occurence_Date'] = Occurence_Date

In [13]:
crime_df

Unnamed: 0,Beat,Description,Grid,Occurence_Date,Offense_Category,Offense_Code,Offense_Ext,Police_District,Record_ID
0,6B,246 PC SHOOT OCCUP DWELL/VEH,1401,"Wed 01-01-2020, 12:17:00 AM",WEAPON OFFENSE,5213,4,6,1429759
1,2B,594(A)(2) VANDALISM-DAMAGE,564,"Wed 01-01-2020, 01:53:00 AM",VANDALISM,2999,33,2,1429760
2,3M,69 PC RESIST/OBSTRCT EXEC OFC,741,"Wed 01-01-2020, 01:48:00 AM",OBSTRUCTING,4801,5,3,1429779
3,6C,10851(A)VC TAKE VEH W/O OWNER,1441,"Wed 01-01-2020, 03:00:00 AM",STOLEN VEHICLE,2404,0,6,1429804
4,3B,23153(A) DUI ALCOHOL W/INJ,777,"Wed 01-01-2020, 02:43:00 AM",TRAFFIC,5404,10,3,1429831
...,...,...,...,...,...,...,...,...,...
14131,1A,484 PC PETTY THEFT,302,"Fri 05-29-2020, 09:38:00 PM",LARCENY,2399,2,1,1459402
14132,4B,484 PC PETTY THEFT,1351,"Tue 05-19-2020, 08:50:00 PM",LARCENY,2399,2,4,1459418
14133,6C,20002(A) HIT/RUN-FAIL INFO,1447,"Thu 05-28-2020, 06:00:00 PM",TRAFFIC,5401,0,6,1459429
14134,6E,TRAFFIC ACCIDENT-INJURY,1102,"Tue 05-26-2020, 07:41:00 PM",TRAFFIC,5400,0,6,1459437


In [14]:
crime_df['Occurence_day'] = pd.DatetimeIndex(crime_df['Occurence_Date']).dayofweek
crime_df['Occurence_dayName'] = pd.DatetimeIndex(crime_df['Occurence_Date']).weekday

crime_df['Occurence_dayName']

0        2
1        2
2        2
3        2
4        2
        ..
14131    4
14132    1
14133    3
14134    1
14135    5
Name: Occurence_dayName, Length: 14136, dtype: int64

In [15]:
crime_df['Occurence_hour'] = pd.DatetimeIndex(crime_df['Occurence_Date']).hour
crime_df['Occurence_hour']

0         0
1         1
2         1
3         3
4         2
         ..
14131    21
14132    20
14133    18
14134    19
14135    17
Name: Occurence_hour, Length: 14136, dtype: int64

In [16]:
import datetime as dt

In [17]:
dayOfWeek={0:'Monday', 1:'Tuesday', 2:'Wednesday', 3:'Thursday', 4:'Friday', 5:'Saturday', 6:'Sunday'}
crime_df['Occurence_dayName'] = crime_df['Occurence_day'].map(dayOfWeek)

crime_df['Occurence_dayName']

0        Wednesday
1        Wednesday
2        Wednesday
3        Wednesday
4        Wednesday
           ...    
14131       Friday
14132      Tuesday
14133     Thursday
14134      Tuesday
14135     Saturday
Name: Occurence_dayName, Length: 14136, dtype: object

In [18]:
crime_df.head()

Unnamed: 0,Beat,Description,Grid,Occurence_Date,Offense_Category,Offense_Code,Offense_Ext,Police_District,Record_ID,Occurence_day,Occurence_dayName,Occurence_hour
0,6B,246 PC SHOOT OCCUP DWELL/VEH,1401,"Wed 01-01-2020, 12:17:00 AM",WEAPON OFFENSE,5213,4,6,1429759,2,Wednesday,0
1,2B,594(A)(2) VANDALISM-DAMAGE,564,"Wed 01-01-2020, 01:53:00 AM",VANDALISM,2999,33,2,1429760,2,Wednesday,1
2,3M,69 PC RESIST/OBSTRCT EXEC OFC,741,"Wed 01-01-2020, 01:48:00 AM",OBSTRUCTING,4801,5,3,1429779,2,Wednesday,1
3,6C,10851(A)VC TAKE VEH W/O OWNER,1441,"Wed 01-01-2020, 03:00:00 AM",STOLEN VEHICLE,2404,0,6,1429804,2,Wednesday,3
4,3B,23153(A) DUI ALCOHOL W/INJ,777,"Wed 01-01-2020, 02:43:00 AM",TRAFFIC,5404,10,3,1429831,2,Wednesday,2


# Placing Data Frame into a CSV

In [19]:
crime_df.to_csv('sac_crime', index=False)

# Machine Learning

In [50]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
style.use("ggplot")

In [51]:
X = crime_df[['Police_District','Beat','Grid','Occurence_day','Occurence_hour']]
y = crime_df["Offense_Category"]

crime_df["Offense_Category"]

0        WEAPON OFFENSE
1             VANDALISM
2           OBSTRUCTING
3        STOLEN VEHICLE
4               TRAFFIC
              ...      
14131           LARCENY
14132           LARCENY
14133           TRAFFIC
14134           TRAFFIC
14135    STOLEN VEHICLE
Name: Offense_Category, Length: 14136, dtype: object

Train_test_split to create training and testing data

In [52]:
from sklearn.model_selection import train_test_split

#Using get_dummies to convert non numerical data
X = pd.get_dummies(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42)

In [53]:
X_train.head()

Unnamed: 0,Police_District,Grid,Occurence_day,Occurence_hour,Beat_1A,Beat_1B,Beat_1C,Beat_2A,Beat_2B,Beat_2C,...,Beat_4B,Beat_4C,Beat_5A,Beat_5B,Beat_5C,Beat_6A,Beat_6B,Beat_6C,Beat_6D,Beat_6E
5743,3,724,3,23,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
12192,6,877,5,17,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5169,2,626,3,20,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7195,1,328,5,20,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2564,3,717,3,21,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Data Preprocessing

In [54]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler

label_encoder = LabelEncoder()
label_encoder.fit(y)
label_encoder.classes_

X_scaler = StandardScaler().fit(X_train)

Scale both the training and testing data

In [55]:
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

One-hot encode the labels

In [56]:
from tensorflow.keras.utils import to_categorical

In [35]:
# One-hot encoding

# Step 1: Label-encode data set

encoded_y_train = label_encoder.transform(y_train)
encoded_y_test = label_encoder.transform(y_test)

y_train_categorical = to_categorical(encoded_y_train)
y_test_categorical = to_categorical(encoded_y_test)
y_train_categorical

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

## Defining our Model Architecture (the layers)

We first need to create a sequential model

In [36]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()

Next, we add our first layer. This layer requires you to specify both the number of inputs and the number of nodes that you want in the hidden layer.

In [37]:
y_train_categorical.shape

(10602, 51)

In [38]:
number_inputs = 24


model.add(Dense(units=100, activation='relu', input_dim=24))
model.add(Dense(units=100, activation='relu'))
model.add(Dense(units=51, activation='softmax'))

In [39]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 100)               2500      
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
dense_2 (Dense)              (None, 51)                5151      
Total params: 17,751
Trainable params: 17,751
Non-trainable params: 0
_________________________________________________________________


## Compile the Model

Now that we have our model architecture defined, we must compile the model using a loss function and optimizer. We can also specify additional training metrics such as accuracy.

In [40]:
# Use categorical crossentropy for categorical data and mean squared error for regression
# Hint: your output layer in this example is using software for logistic regression (categorical)
# If your output layer activation was `linear` then you may want to use `mse` for loss
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

## Training the Model
Finally, we train our model using our training data

Training consists of updating our weights using our optimizer and loss function. In this example, we choose 1000 iterations (loops) of training that are called epochs.

We also choose to shuffle our training data and increase the detail printed out during each training cycle.

In [41]:
y_train_categorical.shape

(10602, 51)

In [58]:
# Fit (train) the model
model.fit(
    X_train_scaled,
    y_train_categorical,
    epochs=50,
    shuffle=True,
    verbose=2
)

Epoch 1/50
332/332 - 1s - loss: 1.8840 - accuracy: 0.3208
Epoch 2/50
332/332 - 1s - loss: 1.8803 - accuracy: 0.3192
Epoch 3/50
332/332 - 1s - loss: 1.8797 - accuracy: 0.3230
Epoch 4/50
332/332 - 1s - loss: 1.8779 - accuracy: 0.3185
Epoch 5/50
332/332 - 1s - loss: 1.8801 - accuracy: 0.3240
Epoch 6/50
332/332 - 1s - loss: 1.8789 - accuracy: 0.3264
Epoch 7/50
332/332 - 1s - loss: 1.8791 - accuracy: 0.3221
Epoch 8/50
332/332 - 1s - loss: 1.8805 - accuracy: 0.3195
Epoch 9/50
332/332 - 1s - loss: 1.8822 - accuracy: 0.3210
Epoch 10/50
332/332 - 1s - loss: 1.8777 - accuracy: 0.3238
Epoch 11/50
332/332 - 1s - loss: 1.8755 - accuracy: 0.3231
Epoch 12/50
332/332 - 1s - loss: 1.8815 - accuracy: 0.3171
Epoch 13/50
332/332 - 1s - loss: 1.8794 - accuracy: 0.3229
Epoch 14/50
332/332 - 1s - loss: 1.8806 - accuracy: 0.3225
Epoch 15/50
332/332 - 1s - loss: 1.8832 - accuracy: 0.3200
Epoch 16/50
332/332 - 1s - loss: 1.8806 - accuracy: 0.3195
Epoch 17/50
332/332 - 1s - loss: 1.8796 - accuracy: 0.3195
Epoch 

<tensorflow.python.keras.callbacks.History at 0x1e53abe7fc8>

## Quantifying the Model
We use our testing data to validate our model. This is how we determine the validity of our model (i.e. the ability to predict new and previously unseen data points)

In [59]:
# Evaluate the model using the testing data
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=5)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Loss: 5.686732769012451, Accuracy: 0.16921335458755493
