# Hyperparameter Tuning
In this notebook we perform hyperparameter tuning on the best models chosen in the model selection notebook.
We also select the final features that will be used for our model.


In [31]:
%matplotlib notebook
import IPython
from IPython.display import display
import pandas as pd
import csv
from numpy import nan as NA
from datetime import datetime
import re
import sys
import numpy as np
import matplotlib.pyplot as plt 
from pandas import *
import pickle
import requests
import os
from sklearn import model_selection
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report
from sklearn.pipeline import Pipeline
from yellowbrick.classifier import ClassificationReport
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression, SGDClassifier
from sklearn.ensemble import BaggingClassifier, ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import Normalizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.neural_network import MLPClassifier

# Hyperparameter Tuning

## 0.0 Opening Pickled Files

In [32]:
# Load the pickled feature/target bundle produced upstream.
# pickle.load can execute arbitrary code, so only open files you created yourself.
with open('x_y_z.pickle', 'rb') as pickled_bundle:
    feat_var = pickle.load(pickled_bundle)

In [33]:
# Unpack the bundle by position: features first, then the two candidate targets.
# y is "offense group", z is "ucr-rank".
X, y, z = feat_var[0], feat_var[1], feat_var[2]

In [25]:
# Show how many rows fall into each distinct value of the raw target z.
z_i = pd.Series(z)
z_i.value_counts()

6    102462
7     82325
4     26379
5     23033
8     22602
3     17686
2      2170
1       942
9       196
Name: ucr-rank, dtype: int64

In [35]:
# Keep only the final feature columns used for tuning.
selected_features = ["hour", "street", "month", "day", "LATITUDE", "LONGITUDE", "Temperature"]
X = X[selected_features]

## 1. Hyperparameter Tuning with the target variable "offensegroup" y

### 1.1 Defining training, test sets and scaling

In [36]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
# Alternative scalers tried during experimentation:
#scaler =  MinMaxScaler()
#scaler = Normalizer()

# Learn the scaling statistics from the training split only, then apply that
# same transformation to the test split. Re-fitting the scaler on X_test would
# scale the two splits with different statistics and leak test-set information.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Note: the target y is deliberately not scaled, so it stays a discrete class
# label instead of being turned into a continuous value.

### 1.2 Defining the tuning function

In [37]:
def gridsearch(model, parameters, X=None, y=None, cv=4, scoring='accuracy'):
    """Run a cross-validated grid search and return the best score and parameters.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to GridSearchCV.
    parameters : dict or list of dict
        Passed to GridSearchCV as ``param_grid``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` and ``y_train`` are used, which is the original behaviour.
    cv : int, default 4
        Number of cross-validation folds.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates.

    Returns
    -------
    tuple
        ``(accuracy, best_params)``: the search's ``best_score_`` and
        ``best_params_``.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    if X is None:
        X = X_train
    if y is None:
        y = y_train

    grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring=scoring,
                               cv=cv, n_jobs=-1, verbose=10)
    grid_search.fit(X, y)
    return grid_search.best_score_, grid_search.best_params_


### 1.3 Selecting the best models for grid search with cv = 4 (to reduce computation time and CPU temperature)

In [38]:


# Candidate estimators to tune. random_state is pinned on every estimator that
# accepts one so repeated runs of the search give the same scores
# (KNeighborsClassifier has no random_state parameter).
knn = KNeighborsClassifier()
rnf = RandomForestClassifier(random_state=0)
ann = MLPClassifier(random_state=0)
sgd = SGDClassifier(random_state=0)
baggin = BaggingClassifier(random_state=0)

# Search grids, in the same order as `models` below: the two lists are paired
# by position with zip(), so keep them aligned when editing.
# NOTE(review): integer max_samples / max_features are absolute counts in
# BaggingClassifier, so max_samples=1 fits each base estimator on one row - confirm intended.
list_params = [
    {'n_neighbors': [1, 3, 5, 10, 30, 50]},                                           # knn
    {'n_estimators': [50, 128, 300, 500, 1000]},                                      # rnf
    {'hidden_layer_sizes': [(100,), (200,), (300,)], 'max_iter': [200, 400, 1000]},   # ann
    {'alpha': [0.0001, 0.001, 0.1, 1, 10, 50]},                                       # sgd
    {'n_estimators': [1, 10, 50, 100], 'max_features': [1, 3, 5],
     'max_samples': [1, 20, 50, 100]},                                                # baggin
]

models = [knn, rnf, ann, sgd, baggin]

In [39]:
# Best cross-validated accuracy and best parameters, one entry per model,
# in the order of `models`.
scores_dict = {'accuracy': [], 'best_params': []}

# NOTE(review): this guard is always true inside a notebook kernel; presumably it
# is kept so the code can also run as a script with n_jobs=-1 on Windows - confirm.
if __name__ == '__main__':

    for model, param in zip(models, list_params):

        acu, best_params = gridsearch(model, param)
        scores_dict['accuracy'].append(acu)
        scores_dict['best_params'].append(best_params)

        print (scores_dict)

        # Checkpoint after every model: the searches take minutes each, so a
        # crash or interrupt in a later one must not discard the finished ones.
        # The file written after the last model matches a single final dump.
        with open('gridsearch_y.pickle', 'wb') as f:
            pickle.dump(scores_dict, f)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   15.2s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:  1.5min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:  2.9min remaining:  1.7min
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:  4.2min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:  5.5min remaining:   46.8s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.6min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.6min finished


{'accuracy': [0.8300275382926259], 'best_params': [{'n_neighbors': 50}]}
Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   32.3s
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:  1.4min remaining:  2.2min
[Parallel(n_jobs=-1)]: Done  11 out of  20 | elapsed:  3.9min remaining:  3.2min
[Parallel(n_jobs=-1)]: Done  14 out of  20 | elapsed:  7.3min remaining:  3.1min
[Parallel(n_jobs=-1)]: Done  17 out of  20 | elapsed: 13.8min remaining:  2.4min
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 13.9min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed: 13.9min finished


{'accuracy': [0.8300275382926259, 0.8317059342320776], 'best_params': [{'n_neighbors': 50}, {'n_estimators': 1000}]}
Fitting 4 folds for each of 9 candidates, totalling 36 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   32.0s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:  2.6min
[Parallel(n_jobs=-1)]: Done  25 out of  36 | elapsed:  4.2min remaining:  1.8min
[Parallel(n_jobs=-1)]: Done  29 out of  36 | elapsed:  5.4min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  33 out of  36 | elapsed:  6.2min remaining:   33.8s
[Parallel(n_jobs=-1)]: Done  36 out of  36 | elapsed:  6.8min finished


{'accuracy': [0.8300275382926259, 0.8317059342320776, 0.8306529995140302], 'best_params': [{'n_neighbors': 50}, {'n_estimators': 1000}, {'hidden_layer_sizes': (200,), 'max_iter': 200}]}
Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:    3.5s remaining:    3.5s
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:    3.6s remaining:    2.1s
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:    4.0s remaining:    1.3s
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:    4.1s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    4.3s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:    4.3s finished


{'accuracy': [0.8300275382926259, 0.8317059342320776, 0.8306529995140302, 0.8300185388505913], 'best_params': [{'n_neighbors': 50}, {'n_estimators': 1000}, {'hidden_layer_sizes': (200,), 'max_iter': 200}, {'alpha': 0.0001}]}
Fitting 4 folds for each of 48 candidates, totalling 192 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:    5.6s
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    6.5s
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   10.6s
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:   12.3s
[Parallel(n_jobs=-1)]: Done  45 tasks      | elapsed:   17.1s
[Parallel(n_jobs=-1)]: Done  56 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done  69 tasks      | elapsed:   29.1s
[Parallel(n_jobs=-1)]: Done  82 tasks      | elapsed:   31.8s
[Parallel(n_jobs=-1)]: Done  97 tasks      | elapsed:   40.5s
[Parallel(n_jobs=-1)]: Done 112 tasks      | elapsed:   48.1s
[Parallel(n_jobs=-1)]: Done 129 tasks      | elapsed:   57.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 165 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 192 out of 192 | elapsed:  1.7min finished


{'accuracy': [0.8300275382926259, 0.8317059342320776, 0.8306529995140302, 0.8300185388505913, 0.8300500368977124], 'best_params': [{'n_neighbors': 50}, {'n_estimators': 1000}, {'hidden_layer_sizes': (200,), 'max_iter': 200}, {'alpha': 0.0001}, {'max_features': 5, 'max_samples': 50, 'n_estimators': 100}]}


## 2. Hyperparameter Tuning with the target variable "ucr-rank" z with 3 classes

## Run 0.0 again

In [47]:
# Reset X, y and z from the pickled bundle so this section starts from the raw data.
# y is "offense group", z is "ucr-rank".
X, y, z = feat_var[0], feat_var[1], feat_var[2]

In [48]:
# Keep only the final feature columns used for tuning.
selected_features = ["hour", "street", "month", "day", "LATITUDE", "LONGITUDE", "Temperature"]
X = X[selected_features]

In [49]:
# Show how many rows fall into each distinct value of the raw target z
# before it is regrouped.
z_i = pd.Series(z)
z_i.value_counts()

6    102462
7     82325
4     26379
5     23033
8     22602
3     17686
2      2170
1       942
9       196
Name: ucr-rank, dtype: int64

### 2.1 Regrouping the target, defining training and test sets, and scaling

In [50]:
def new_cat(z):
    """Collapse the ucr-rank codes into three classes.

    Mapping:
      * 1 <= rank < 5     -> 3
      * rank == 7 or 8    -> 2
      * anything else     -> 1  (e.g. 5, 6, 9, and any value outside the ranges above)

    Parameters
    ----------
    z : list, array or pandas.Series of numbers
        Rank codes to regroup. The input is not modified.

    Returns
    -------
    pandas.Series
        int64 class labels in the same order as ``z``, with a fresh 0..n-1
        index (the index of ``z`` is not carried over).
    """
    ranks = pd.Series(z).to_numpy()
    # Vectorized form of the if/elif chain; conditions are checked in order
    # and everything unmatched falls through to the default class 1.
    grouped = np.select(
        [(ranks >= 1) & (ranks < 5), (ranks == 7) | (ranks == 8)],
        [3, 2],
        default=1,
    )
    return pd.Series(grouped, dtype='int64')

In [51]:
# Regroup from the raw ucr-rank values rather than from z itself, so re-running
# this cell cannot remap labels that were already regrouped (1, 2, 3 would all become 3).
z = new_cat(feat_var[2])

In [52]:
# Wrap the regrouped target in a Series for inspection.
zi = pd.Series(z)

In [53]:
# Row count per regrouped class.
zi.value_counts()

1    125691
2    104927
3     47177
dtype: int64

In [54]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, z_train, z_test = model_selection.train_test_split(X, z, test_size=0.2, random_state=0)

scaler = StandardScaler()
# Alternative scalers tried during experimentation:
#scaler =  MinMaxScaler()
#scaler = Normalizer()

# Fit the scaler on the training split only and reuse it for the test split.
# Re-fitting on X_test would scale the two splits with different statistics
# and leak test-set information.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [55]:
def gridsearch(model, parameters, X=None, y=None, cv=4, scoring='accuracy'):
    """Run a cross-validated grid search and return the best score and parameters.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to GridSearchCV.
    parameters : dict or list of dict
        Passed to GridSearchCV as ``param_grid``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` and ``z_train`` are used, which is the original behaviour.
    cv : int, default 4
        Number of cross-validation folds.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates.

    Returns
    -------
    tuple
        ``(accuracy, best_params)``: the search's ``best_score_`` and
        ``best_params_``.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    if X is None:
        X = X_train
    if y is None:
        y = z_train

    grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring=scoring,
                               cv=cv, n_jobs=-1, verbose=10)
    grid_search.fit(X, y)
    return grid_search.best_score_, grid_search.best_params_


In [None]:
# Best cross-validated accuracy and best parameters, one entry per model,
# in the order of `models`.
scores_dict2 = {'accuracy': [], 'best_params': []}

# NOTE(review): this guard is always true inside a notebook kernel; presumably it
# is kept so the code can also run as a script with n_jobs=-1 on Windows - confirm.
if __name__ == '__main__':

    for model, param in zip(models, list_params):

        acu, best_params = gridsearch(model, param)
        scores_dict2['accuracy'].append(acu)
        scores_dict2['best_params'].append(best_params)

        print (scores_dict2)

        # Checkpoint after every model: if a later search dies part-way
        # through, the results of the models that already finished are kept.
        # The file written after the last model matches a single final dump.
        with open('gridsearch_z3.pickle', 'wb') as g:
            pickle.dump(scores_dict2, g)

Fitting 4 folds for each of 6 candidates, totalling 24 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   14.9s
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done  12 out of  24 | elapsed:  1.4min remaining:  1.4min
[Parallel(n_jobs=-1)]: Done  15 out of  24 | elapsed:  2.5min remaining:  1.5min
[Parallel(n_jobs=-1)]: Done  18 out of  24 | elapsed:  3.9min remaining:  1.3min
[Parallel(n_jobs=-1)]: Done  21 out of  24 | elapsed:  5.1min remaining:   43.8s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.2min remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  24 out of  24 | elapsed:  5.2min finished


{'accuracy': [0.5098678881909322], 'best_params': [{'n_neighbors': 50}]}
Fitting 4 folds for each of 5 candidates, totalling 20 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done   8 out of  20 | elapsed:  1.6min remaining:  2.4min
Exception in thread Thread-42:
Traceback (most recent call last):
  File "c:\users\franc\appdata\local\programs\python\python36\lib\threading.py", line 916, in _bootstrap_inner
    self.run()
  File "c:\users\franc\appdata\local\programs\python\python36\lib\threading.py", line 864, in run
    self._target(*self._args, **self._kwargs)
  File "c:\users\franc\appdata\local\programs\python\python36\lib\multiprocessing\pool.py", line 463, in _handle_results
    task = get()
  File "c:\users\franc\appdata\local\programs\python\python36\lib\multiprocessing\connection.py", line 250, in recv
    buf = self._recv_bytes()
  File "c:\users\franc\appdata\local\programs\python\python36\lib\multiprocessing\connection.py", line 318, in _recv_bytes
    return self._get_more_data(ov, maxsize)
  File "c:\users\franc\appdata\local\programs\python\python36\lib\multi

## 3. Hyperparameter Tuning with the target variable "ucr-rank" z with 2 classes (car theft and motor vehicle theft into violent)

This variable will not be used.

## Run 0.0 again:

In [None]:
# Reset X, y and z from the pickled bundle so this section starts from the raw data.
# y is "offense group", z is "ucr-rank".
# NOTE(review): unlike sections 1 and 2, no feature subset is selected after this
# reload, so X keeps every column of feat_var[0] - confirm this is intended.
X, y, z = feat_var[0], feat_var[1], feat_var[2]

In [None]:
# Show how many rows fall into each distinct value of the raw target z
# before it is regrouped.
z_i = pd.Series(z)
z_i.value_counts()

In [None]:
def new_cat2(z):
    """Collapse the ucr-rank codes into two classes.

    Mapping:
      * 1 <= rank < 5, or rank == 7, or rank == 8  -> 2
      * anything else                              -> 1  (e.g. 5, 6, 9)

    Parameters
    ----------
    z : list, array or pandas.Series of numbers
        Rank codes to regroup. The input is not modified.

    Returns
    -------
    pandas.Series
        int64 class labels in the same order as ``z``, with a fresh 0..n-1
        index (the index of ``z`` is not carried over).
    """
    ranks = pd.Series(z).to_numpy()
    # All three original branches that assigned 2 are merged into one mask;
    # everything unmatched falls through to the default class 1.
    in_class_two = ((ranks >= 1) & (ranks < 5)) | (ranks == 7) | (ranks == 8)
    grouped = np.select([in_class_two], [2], default=1)
    return pd.Series(grouped, dtype='int64')

In [None]:
# Regroup from the raw ucr-rank values rather than from z itself, so re-running
# this cell (or running it after another regrouping) cannot remap already-regrouped labels.
z = new_cat2(feat_var[2])

In [None]:
# Wrap the regrouped target in a Series for inspection.
zi = pd.Series(z)

In [None]:
# Hold out 20% of the rows for testing; random_state pins the split.
X_train, X_test, z_train, z_test = model_selection.train_test_split(X, z, test_size=0.2, random_state=0)

scaler = StandardScaler()
# Alternative scalers tried during experimentation:
#scaler =  MinMaxScaler()
#scaler = Normalizer()

# Fit the scaler on the training split only and reuse it for the test split.
# Re-fitting on X_test would scale the two splits with different statistics
# and leak test-set information.
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
def gridsearch(model, parameters, X=None, y=None, cv=4, scoring='accuracy'):
    """Run a cross-validated grid search and return the best score and parameters.

    Parameters
    ----------
    model : estimator
        Estimator instance handed to GridSearchCV.
    parameters : dict or list of dict
        Passed to GridSearchCV as ``param_grid``.
    X, y : optional
        Training features and target. When omitted, the notebook-level
        ``X_train`` and ``z_train`` are used, which is the original behaviour.
    cv : int, default 4
        Number of cross-validation folds.
    scoring : str, default 'accuracy'
        Metric used to rank the candidates.

    Returns
    -------
    tuple
        ``(accuracy, best_params)``: the search's ``best_score_`` and
        ``best_params_``.
    """
    # Fall back to the notebook globals so existing two-argument calls still work.
    if X is None:
        X = X_train
    if y is None:
        y = z_train

    grid_search = GridSearchCV(estimator=model, param_grid=parameters, scoring=scoring,
                               cv=cv, n_jobs=-1, verbose=10)
    grid_search.fit(X, y)
    return grid_search.best_score_, grid_search.best_params_

In [None]:
# Best cross-validated accuracy and best parameters, one entry per model,
# in the order of `models`.
scores_dict3 = {'accuracy': [], 'best_params': []}

# NOTE(review): this guard is always true inside a notebook kernel; presumably it
# is kept so the code can also run as a script with n_jobs=-1 on Windows - confirm.
if __name__ == '__main__':

    for model, param in zip(models, list_params):

        acu, best_params = gridsearch(model, param)
        scores_dict3['accuracy'].append(acu)
        scores_dict3['best_params'].append(best_params)

        print (scores_dict3)

        # Checkpoint after every model: if a later search dies part-way
        # through, the results of the models that already finished are kept.
        # The file written after the last model matches a single final dump.
        with open('gridsearch_z2.pickle', 'wb') as h:
            pickle.dump(scores_dict3, h)