In [25]:
# Update sklearn to prevent version mismatches
!pip install sklearn --upgrade

Requirement already up-to-date: sklearn in /Users/beastmode/opt/anaconda3/lib/python3.7/site-packages (0.0)


In [26]:
# install joblib. This will be used to save your model. 
# Restart your kernel after installing 
!pip install joblib



In [1]:
# Set the seed value for the notebook so the results are reproducible
from numpy.random import seed
seed(1)

In [2]:
# Dependencies
import numpy as np
import pandas as pd

In [3]:
import tensorflow
tensorflow.keras.__version__

'2.2.4-tf'

# Read the CSV and Perform Basic Data Cleaning

In [4]:
cdf = pd.read_csv('mach_learn_df.csv')

cdf = cdf.drop(columns=['Unnamed: 0','City'])

cdf.head()

Unnamed: 0,Hour,Offense_Type,Premise,Tract,Month,Day_of_Week,Temperature,Weather
0,0,Burglary/Robbery,Residence or House,313100,1,Monday,46.56,Clouds
1,0,Burglary/Robbery,Apartment,321300,1,Monday,46.56,Clouds
2,0,Burglary/Robbery,"Road, Street, or Sidewalk",432801,1,Monday,46.56,Clouds
3,0,Assault,"Road, Street, or Sidewalk",330700,1,Monday,46.56,Clouds
4,0,Theft,"Church, Synagogue, or Temple Parking Lot",312800,1,Monday,46.56,Clouds


# Select your features (columns)

In [5]:
# Assign X (data) and y (target)
X = cdf.drop(["Offense_Type"], axis=1)
y = cdf["Offense_Type"]
print(X.shape, y.shape)

(309959, 7) (309959,)


In [6]:
from sklearn.preprocessing import LabelEncoder

# Step 1: Label-encode data set
label_encodery = LabelEncoder()
encoded_y = label_encodery.fit_transform(y)

In [7]:
from tensorflow.keras.utils import to_categorical
cat_y = to_categorical(encoded_y)
cat_y

array([[0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.]], dtype=float32)

In [8]:
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)
    
encoded_X = MultiColumnLabelEncoder(columns = ['Day_of_Week', 'Premise', 'Tract', 'Weather']).fit_transform(X)

encoded_X

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
0,0,125,156,1,1,46.56,1
1,0,9,180,1,1,46.56,1
2,0,131,385,1,1,46.56,1
3,0,131,204,1,1,46.56,1
4,0,33,153,1,1,46.56,1
...,...,...,...,...,...,...,...
309954,23,126,491,5,3,74.26,6
309955,23,126,486,5,3,74.26,6
309956,23,18,486,5,3,74.26,6
309957,23,126,361,5,3,74.26,6


# Create a Train Test Split

Use `koi_disposition` for the y values

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(encoded_X, cat_y, random_state=1, stratify=y)

In [10]:
y_test

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.]], dtype=float32)

In [11]:
X_train.head()

Unnamed: 0,Hour,Premise,Tract,Month,Day_of_Week,Temperature,Weather
103959,22,140,275,12,4,72.07,1
277814,17,126,317,3,5,77.31,1
252929,17,76,228,12,1,56.98,0
292483,14,126,400,4,1,53.78,0
41013,16,115,399,6,6,86.13,4


# Pre-processing

Scale the data using the MinMaxScaler and perform some feature selection

In [12]:
# # Scale your data
# from sklearn.preprocessing import StandardScaler
# X_scaler = StandardScaler().fit(X_train)

In [13]:
# X_train_scaled = X_scaler.transform(X_train)
# X_test_scaled = X_scaler.transform(X_test)


# Train the Model



In [14]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

In [15]:
# Create model and add layers
model = Sequential()
model.add(Dense(units=50, activation='relu', input_dim=7))
model.add(Dense(units=50, activation='relu'))
model.add(Dense(units=9, activation='softmax'))
model.add(Dense(units=50, input_dim=7))
model.add(Dense(units=50))
model.add(Dense(units=9))

In [16]:
# Compile and fit the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

In [26]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 50)                400       
_________________________________________________________________
dense_7 (Dense)              (None, 50)                2550      
_________________________________________________________________
dense_8 (Dense)              (None, 9)                 459       
Total params: 3,409
Trainable params: 3,409
Non-trainable params: 0
_________________________________________________________________


In [20]:
import concurrent.futures
from tensorflow.keras.layers import Dense
from tensorflow.keras.models import Sequential
import tensorflow as tf
import tensorflow.keras.backend as K
from tensorflow.python.client import device_lib

def get_available_gpus():
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

def fit(gpu):
    with tf.Session(graph=tf.Graph()) as sess:
        K.set_session(sess)
        with tf.device(gpu):
            model = Sequential()
            model.add(Dense(50, input_dim=7, activation='relu'))
            model.add(Dense(50, input_dim=7, activation='relu'))
            model.add(Dense(9, activation='sigmoid'))
            model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
            model.fit(X_train, y_train, epochs=100, verbose=0)

            return model.evaluate(X_train, y_train, verbose=0)

gpus = get_available_gpus()

with concurrent.futures.ThreadPoolExecutor(len(gpus)) as executor:
    results = [x for x in executor.map(fit, gpus)]
print('results: ', results)

# model.fit(
#     X_train,
#     y_train,
#     epochs=60,
#     shuffle=True,
#     verbose=2,
#     n_jobs=-1
# )

ValueError: max_workers must be greater than 0

In [None]:
model_loss, model_accuracy = model.evaluate(
    X_test_scaled, y_test_categorical, verbose=2)
print(
    f"Normal Neural Network - Loss: {model_loss}, Accuracy: {model_accuracy}")

# Make Predictions

In [None]:
encoded_predictions = model.predict_classes(X_test[:5])
prediction_labels = label_encoder.inverse_transform(encoded_predictions)

In [None]:
print(f"Predicted classes: {prediction_labels}")
print(f"Actual Labels: {list(y_test[:5])}")

# Hyperparameter Tuning

Use `GridSearchCV` to tune the model's parameters

In [52]:
import numpy
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

train_x = np.asarray(X_train)
train_y = np.asarray(y_train)

print('working1')

# Function to create model, required for KerasClassifier
def create_model(activation='relu'):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=7, kernel_initializer='uniform', activation=activation))
    model.add(Dense(9, kernel_initializer='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('working')
    return model

# create model
model = KerasClassifier(build_fn=create_model, verbose=2)

# define the grid search parameters
epochs = [25, 50, 100, 150]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_x[:10000], train_y[:10000])

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

working1


KeyboardInterrupt: 

In [38]:
import numpy
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

train_x = np.asarray(X_train)
train_y = np.asarray(y_train)

print('working1')

# Function to create model, required for KerasClassifier
def create_model(activation='relu'):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=7, kernel_initializer='uniform', activation=activation))
    model.add(Dense(9, kernel_initializer='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    print('working')
    return model

# create model
model = KerasClassifier(build_fn=create_model, epochs=25, verbose=2)

# define the grid search parameters
activation = ['softmax', 'softplus', 'softsign', 'relu', 'tanh', 'sigmoid', 'hard_sigmoid', 'linear']
param_grid = dict(activation=activation)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_x[:10000], train_y[:10000])

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

working1
working
Train on 10000 samples
Epoch 1/25
10000/10000 - 2s - loss: 2.0508 - accuracy: 0.4383
Epoch 2/25
10000/10000 - 1s - loss: 1.8331 - accuracy: 0.4476
Epoch 3/25
10000/10000 - 1s - loss: 1.7064 - accuracy: 0.4476
Epoch 4/25
10000/10000 - 1s - loss: 1.6278 - accuracy: 0.4476
Epoch 5/25
10000/10000 - 1s - loss: 1.5777 - accuracy: 0.4476
Epoch 6/25
10000/10000 - 1s - loss: 1.5448 - accuracy: 0.4476
Epoch 7/25
10000/10000 - 1s - loss: 1.5226 - accuracy: 0.4476
Epoch 8/25
10000/10000 - 1s - loss: 1.5075 - accuracy: 0.4476
Epoch 9/25
10000/10000 - 1s - loss: 1.4973 - accuracy: 0.4476
Epoch 10/25
10000/10000 - 1s - loss: 1.4904 - accuracy: 0.4476
Epoch 11/25
10000/10000 - 1s - loss: 1.4856 - accuracy: 0.4476
Epoch 12/25
10000/10000 - 1s - loss: 1.4823 - accuracy: 0.4476
Epoch 13/25
10000/10000 - 1s - loss: 1.4800 - accuracy: 0.4476
Epoch 14/25
10000/10000 - 1s - loss: 1.4785 - accuracy: 0.4476
Epoch 15/25
10000/10000 - 1s - loss: 1.4773 - accuracy: 0.4476
Epoch 16/25
10000/10000 

AttributeError: 'GridSearchCV' object has no attribute 'best_score'

In [49]:
best_activation = 'relu'
best_activation

'relu'

In [36]:
import numpy
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

train_x = np.asarray(X_train)
train_y = np.asarray(y_train)

print('working1')

# Function to create model, required for KerasClassifier
def create_model(optimizer='adam'):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=7, kernel_initializer='uniform', activation='relu'))
    model.add(Dense(9, kernel_initializer='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print('working')
    return model

# create model
model = KerasClassifier(build_fn=create_model, epochs=50, verbose=2)

# define the grid search parameters
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_x[:10000], train_y[:10000])

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
best_optimizer = '%f' % grid_result.best_score_


working1
working
Train on 10000 samples
Epoch 1/50
10000/10000 - 2s - loss: 1.5326 - accuracy: 0.4468
Epoch 2/50
10000/10000 - 1s - loss: 1.5085 - accuracy: 0.4475
Epoch 3/50
10000/10000 - 1s - loss: 1.5076 - accuracy: 0.4476
Epoch 4/50
10000/10000 - 1s - loss: 1.5172 - accuracy: 0.4476
Epoch 5/50
10000/10000 - 1s - loss: 1.5084 - accuracy: 0.4476
Epoch 6/50
10000/10000 - 1s - loss: 1.5008 - accuracy: 0.4476
Epoch 7/50
10000/10000 - 1s - loss: 1.5024 - accuracy: 0.4476
Epoch 8/50
10000/10000 - 1s - loss: 1.5038 - accuracy: 0.4476
Epoch 9/50
10000/10000 - 1s - loss: 1.4991 - accuracy: 0.4476
Epoch 10/50
10000/10000 - 1s - loss: 1.4992 - accuracy: 0.4476
Epoch 11/50
10000/10000 - 1s - loss: 1.5065 - accuracy: 0.4476
Epoch 12/50
10000/10000 - 1s - loss: 1.4960 - accuracy: 0.4476
Epoch 13/50
10000/10000 - 1s - loss: 1.4937 - accuracy: 0.4476
Epoch 14/50
10000/10000 - 1s - loss: 1.4954 - accuracy: 0.4476
Epoch 15/50
10000/10000 - 1s - loss: 1.4947 - accuracy: 0.4473
Epoch 16/50
10000/10000 

In [50]:
best_optimizer = 'adam'

In [None]:
# Function to create model, required for KerasClassifier
def create_model(learn_rate=0.01, momentum=0):
    # create model
    model = Sequential()
    model.add(Dense(12, input_dim=7, kernel_initializer='uniform', activation='relu'))
    model.add(Dense(9, kernel_initializer='uniform', activation='sigmoid'))
    # Compile model
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
    print('working')
    return model

# create model
model = KerasClassifier(build_fn=create_model, epochs=50, verbose=2)

# define the grid search parameters
optimizer = ['SGD', 'RMSprop', 'Adagrad', 'Adadelta', 'Adam', 'Adamax', 'Nadam']
param_grid = dict(optimizer=optimizer)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(train_x[:10000], train_y[:10000])

# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


In [27]:
best_neural = 

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] n_estimators=10 .................................................
[CV] ..................... n_estimators=10, score=0.425, total=   6.8s
[CV] n_estimators=10 .................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    6.8s remaining:    0.0s


[CV] ..................... n_estimators=10, score=0.427, total=   7.5s
[CV] n_estimators=10 .................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   14.4s remaining:    0.0s


[CV] ..................... n_estimators=10, score=0.423, total=   6.4s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.448, total=  38.0s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.446, total=  33.9s
[CV] n_estimators=50 .................................................
[CV] ..................... n_estimators=50, score=0.445, total=  36.3s
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.450, total= 1.3min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.449, total= 1.1min
[CV] n_estimators=100 ................................................
[CV] .................... n_estimators=100, score=0.448, total= 1.1min
[CV] n_estimators=150 ................................................
[CV] .

KeyboardInterrupt: 

{'n_estimators': 150}
0.8958611481975968


# Save the Model

In [47]:
# save your model by updating "your_name" with your name
# and "your_model" with your model variable
# be sure to turn this in to BCS
# if joblib fails to import, try running the command to install in terminal/git-bash
import joblib
filename = 'vikash_bhakta_rf.sav'
joblib.dump(rf, filename)

['vikash_bhakta_rf.sav']