In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler,OneHotEncoder
import pandas as pd
import tensorflow as tf


In [2]:
# Import our input dataset
df = pd.read_csv('../Resources/tableau_data.csv')
df.head()

Unnamed: 0,Registration State,Plate Type,Violation Code,Vehicle Body Type,Vehicle Make,Violation Time,Vehicle Color,Vehicle Year
0,NY,PAS,67,SDN,TOYOT,10:37 AM,BLK,2004
1,NY,PAS,51,SUBN,JEEP,10:45 AM,GRY,2017
2,NY,PAS,67,SUBN,TOYOT,12:05 PM,OTH,2021
3,NY,PAS,98,SUBN,NISSA,05:35 AM,BLK,2002
4,FL,PAS,51,SUBN,SUBAR,03:20 AM,GRY,2005


In [3]:
# Generate our categorical variable list
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()

In [4]:
# Check the number of unique values in each column
df[df_cat].nunique()

Registration State      8
Plate Type              9
Vehicle Body Type       9
Vehicle Make           13
Violation Time        784
Vehicle Color           8
dtype: int64

In [5]:
#convert to violation time to dataetime format
import datetime
df["Violation Time"] = pd.to_datetime(df["Violation Time"], format='%H:%M %p' )

In [6]:
#converting datetime into minutes passed 12:00 AM
df['Violation Time'] = df['Violation Time'].apply(lambda x: (x.to_pydatetime()-datetime.datetime(1900,1,1)).total_seconds()/60)
df.head(10)

Unnamed: 0,Registration State,Plate Type,Violation Code,Vehicle Body Type,Vehicle Make,Violation Time,Vehicle Color,Vehicle Year
0,NY,PAS,67,SDN,TOYOT,637.0,BLK,2004
1,NY,PAS,51,SUBN,JEEP,645.0,GRY,2017
2,NY,PAS,67,SUBN,TOYOT,725.0,OTH,2021
3,NY,PAS,98,SUBN,NISSA,335.0,BLK,2002
4,FL,PAS,51,SUBN,SUBAR,200.0,GRY,2005
5,NY,PAS,63,SUBN,CHEVR,662.0,WHT,2010
6,NY,PAS,45,SDN,NISSA,366.0,BLK,2002
7,NY,OMS,14,SDN,TOYOT,770.0,BLK,2022
8,NY,PAS,20,SUBN,HONDA,631.0,RED,2005
9,NY,PAS,27,SDN,NISSA,367.0,BLK,2022


In [7]:
df_cat = df.dtypes[df.dtypes == "object"].index.tolist()
df[df_cat].nunique()

Registration State     8
Plate Type             9
Vehicle Body Type      9
Vehicle Make          13
Vehicle Color          8
dtype: int64

In [12]:
# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(df[df_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names(df_cat)
encode_df.head()



Unnamed: 0,Registration State_CT,Registration State_FL,Registration State_GA,Registration State_NJ,Registration State_NY,Registration State_OTHER,Registration State_PA,Registration State_TX,Plate Type_APP,Plate Type_COM,...,Vehicle Make_SUBAR,Vehicle Make_TOYOT,Vehicle Color_BLK,Vehicle Color_BLU,Vehicle Color_GRN,Vehicle Color_GRY,Vehicle Color_ORG,Vehicle Color_OTH,Vehicle Color_RED,Vehicle Color_WHT
0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [13]:
# Merge one-hot encoded features and drop the originals
nn_df = df.merge(encode_df,left_index=True, right_index=True)
nn_df = nn_df.drop(df_cat,1)
nn_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,Violation Code,Violation Time,Vehicle Year,Registration State_CT,Registration State_FL,Registration State_GA,Registration State_NJ,Registration State_NY,Registration State_OTHER,Registration State_PA,...,Vehicle Make_SUBAR,Vehicle Make_TOYOT,Vehicle Color_BLK,Vehicle Color_BLU,Vehicle Color_GRN,Vehicle Color_GRY,Vehicle Color_ORG,Vehicle Color_OTH,Vehicle Color_RED,Vehicle Color_WHT
0,67,637.0,2004,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,51,645.0,2017,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,67,725.0,2021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,98,335.0,2002,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,51,200.0,2005,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [14]:
# Split our preprocessed data into our features and target arrays
x_cols = [i for i in nn_df.columns if i not in ('Violation Code')]
X, y = nn_df[x_cols], nn_df['Violation Code']

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [15]:
# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [21]:
X_train

Unnamed: 0,Violation Time,Vehicle Year,Registration State_CT,Registration State_FL,Registration State_GA,Registration State_NJ,Registration State_NY,Registration State_OTHER,Registration State_PA,Registration State_TX,...,Vehicle Make_SUBAR,Vehicle Make_TOYOT,Vehicle Color_BLK,Vehicle Color_BLU,Vehicle Color_GRN,Vehicle Color_GRY,Vehicle Color_ORG,Vehicle Color_OTH,Vehicle Color_RED,Vehicle Color_WHT
6064322,658.0,2018,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6014833,513.0,2019,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
5414894,196.0,2023,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1908965,620.0,2006,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4066673,755.0,2010,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3774774,690.0,2004,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4689960,549.0,2021,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5478295,769.0,2020,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2638294,753.0,2020,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [20]:
# Define the model - deep neural net
number_input_features = len(X_train[0])
hidden_nodes_layer1 = 8
hidden_nodes_layer2 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))

# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Check the structure of the model
nn.summary()

KeyError: 0