# Classifier - Feature Engineering / Model Selection 

In [1]:
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import cv2
import glob
import pickle
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from skimage.feature import hog
from sklearn.model_selection import train_test_split

from skimage.feature import hog

## Feature Engineering 

In [2]:
# Define a function to return HOG features and visualization
def get_hog_features(img, orient, pix_per_cell, cell_per_block, 
                        vis=False, feature_vec=True):
    # Call with two outputs if vis==True
    if vis == True:
        features, hog_image = hog(img, orientations=orient, 
                                  pixels_per_cell=(pix_per_cell, pix_per_cell),
                                  cells_per_block=(cell_per_block, cell_per_block), 
                                  transform_sqrt=True, 
                                  visualise=vis, feature_vector=feature_vec)
        return features, hog_image
    # Otherwise call with one output
    else:      
        features = hog(img, orientations=orient, 
                       pixels_per_cell=(pix_per_cell, pix_per_cell),
                       cells_per_block=(cell_per_block, cell_per_block), 
                       transform_sqrt=True, 
                       visualise=vis, feature_vector=feature_vec)
        return features

# Define a function to compute binned color features  
def bin_spatial(img, size=(32, 32)):
    # Use cv2.resize().ravel() to create the feature vector
    features = cv2.resize(img, size).ravel() 
    # Return the feature vector
    return features

# Define a function to compute color histogram features 
# NEED TO CHANGE bins_range if reading .png files with mpimg!
def color_hist(img, nbins=32, bins_range=(0, 256)):
    # Compute the histogram of the color channels separately
    channel1_hist = np.histogram(img[:,:,0], bins=nbins, range=bins_range)
    channel2_hist = np.histogram(img[:,:,1], bins=nbins, range=bins_range)
    channel3_hist = np.histogram(img[:,:,2], bins=nbins, range=bins_range)
    # Concatenate the histograms into a single feature vector
    hist_features = np.concatenate((channel1_hist[0], channel2_hist[0], channel3_hist[0]))
    # Return the individual histograms, bin_centers and feature vector
    return hist_features

# Define a function to extract features from a list of images
# Have this function call bin_spatial() and color_hist()
def extract_features(imgs, color_space='RGB', spatial_size=(32, 32),
                        hist_bins=32, orient=9, 
                        pix_per_cell=8, cell_per_block=2, hog_channel=0,
                        spatial_feat=True, hist_feat=True, hog_feat=True):
    # Create a list to append feature vectors to
    features = []
    # Iterate through the list of images
    for file in imgs:
        file_features = []
        # Read in each one by one
        image = mpimg.imread(file)              # PNG: 0-1, JPG: 0-255
        image = np.uint8(image * 255)           # Scale training images in PNG from 0-1 to 0-255     
        
        # apply color conversion if other than 'RGB'
        if color_space != 'RGB':
            if color_space == 'HSV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HSV)
            elif color_space == 'LUV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2LUV)
            elif color_space == 'HLS':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2HLS)
            elif color_space == 'YUV':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YUV)
            elif color_space == 'YCrCb':
                feature_image = cv2.cvtColor(image, cv2.COLOR_RGB2YCrCb)
        else: feature_image = np.copy(image)      

        if spatial_feat == True:
            spatial_features = bin_spatial(feature_image, size=spatial_size)
            file_features.append(spatial_features)
        if hist_feat == True:
            # Apply color_hist()
            hist_features = color_hist(feature_image, nbins=hist_bins)
            file_features.append(hist_features)
        if hog_feat == True:
        # Call get_hog_features() with vis=False, feature_vec=True
            if hog_channel == 'ALL':
                hog_features = []
                for channel in range(feature_image.shape[2]):
                    hog_features.append(get_hog_features(feature_image[:,:,channel], 
                                        orient, pix_per_cell, cell_per_block, 
                                        vis=False, feature_vec=True))
                hog_features = np.ravel(hog_features)        
            else:
                hog_features = get_hog_features(feature_image[:,:,hog_channel], orient, 
                            pix_per_cell, cell_per_block, vis=False, feature_vec=True)
            # Append the new feature vector to the features list
            file_features.append(hog_features)
        features.append(np.concatenate(file_features))
    # Return list of feature vectors
    return features

In [3]:
cars = glob.iglob('vehicles/vehicles/**/*.png')
notcars = glob.iglob('non-vehicles/non-vehicles/**/*.png')

In [4]:
color_space = 'YUV' # Can be RGB, HSV, LUV, HLS, YUV, YCrCb
orient = 11  # HOG orientations
pix_per_cell = 16 # HOG pixels per cell
cell_per_block = 2 # HOG cells per block
hog_channel = "ALL" # Can be 0, 1, 2, or "ALL"
spatial_size = (16, 16) # Spatial binning dimensions
hist_bins = 16    # Number of histogram bins
spatial_feat = True # Spatial features on or off
hist_feat = True # Histogram features on or off
hog_feat = True # HOG features on or off

car_features = extract_features(cars, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)

notcar_features = extract_features(notcars, color_space=color_space, 
                        spatial_size=spatial_size, hist_bins=hist_bins, 
                        orient=orient, pix_per_cell=pix_per_cell, 
                        cell_per_block=cell_per_block, 
                        hog_channel=hog_channel, spatial_feat=spatial_feat, 
                        hist_feat=hist_feat, hog_feat=hog_feat)

C:\Users\gtesei\AppData\Local\Continuum\Miniconda3\envs\carnd-term1\lib\site-packages\skimage\feature\_hog.py:119: skimage_deprecation: Default value of `block_norm`==`L1` is deprecated and will be changed to `L2-Hys` in v0.15
  'be changed to `L2-Hys` in v0.15', skimage_deprecation)


In [5]:
# Create an array stack of feature vectors
X = np.vstack((car_features, notcar_features)).astype(np.float64)                        

# Define the labels vector
y = np.hstack((np.ones(len(car_features)), np.zeros(len(notcar_features))))

In [6]:

print("*** car_features:",len(car_features))
print("*** notcar_features:",len(notcar_features))

print("*** X:",X.shape)
print("*** y:",y.shape)


*** car_features: 8792
*** notcar_features: 8968
*** X: (17760, 2004)
*** y: (17760,)


**Dataset seems balanced**

## Model Selection 

In [7]:
X_scaler = StandardScaler().fit(X)
# Apply the scaler to X
scaled_X = X_scaler.transform(X)

X_train, X_test, y_train, y_test = train_test_split(scaled_X, y, test_size=0.2,  stratify=y, random_state=1234)

In [8]:
from sklearn.pipeline import Pipeline
clf = LinearSVC(loss='hinge')

clf.fit(X_train, y_train)
print('Test Accuracy of classifier = ', round(clf.score(X_test, y_test), 4))

print("Predictions - first 15 labels: ", clf.predict(X_test[0:15]))
print("Ground truth - first  15 labels: ", y_test[0:15])

Test Accuracy of classifier =  0.9907
Predictions - first 15 labels:  [ 1.  0.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]
Ground truth - first  15 labels:  [ 1.  0.  1.  0.  1.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.]


### Grid 

In [9]:
import sys
import numpy as np
import os
import pandas as pd
from sklearn import preprocessing


from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier

from sklearn.metrics import accuracy_score

import sklearn.linear_model as lm
from sklearn.model_selection import GridSearchCV

import matplotlib.pyplot as plt

from sklearn.metrics import roc_auc_score, log_loss
from numpy import linalg as LA
from sklearn import neighbors


# grid
columns = ['Predictive_Model',
               'Parameters',
               'Accuracy_Mean',
               'Accuracy_STD',
               'Predict_Time']
perf_panel = pd.DataFrame(data=np.zeros((1, len(columns))), columns=columns)

# models
models = ['LinearSVC','LogisticRegression','RandomForest','SVC','KNeighborsClassifier']
parameters = {
    'LinearSVC': {"loss": ['hinge','squared_hinge']},
    'RandomForest': {"n_estimators": [100, 1000],
                     "max_depth": [3, 1, None],
                     "criterion": ["gini", "entropy"]},
    'SVC': {'kernel': ['linear', 'rbf', 'poly'], 'C': [0.1, 1, 5, 10, 50]},
    'LogisticRegression': {'C': [0.001, 0.01, 0.1, 1, 10, 100]},
    'MultinomialNB': {'alpha': [0, 0.0005, 0.1, 0.2, 0.3, 0.4, 0.5, 0.8, 0.9, 1]},
    'KNeighborsClassifier': {'n_neighbors': [5, 10, 20, 50], 'weights': ['uniform', 'distance']},
    'MLPClassifier': {'hidden_layer_sizes': [(1000, 50),(2000,100),(3000,200),(3000,1000,100)]}
}

# options
seeds = [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]

In [10]:
# proc
for model in models:
    if model == 'RandomForest':
        clf = RandomForestClassifier(n_estimators = 1000)
    elif model == 'SVC':
        clf = SVC(kernel='linear',C=10)
    elif model == 'LogisticRegression':
        clf = lm.LogisticRegression(C=1e5)
    elif model == 'MultinomialNB':
        clf =  MultinomialNB(alpha=0.0005)
    elif model == 'KNeighborsClassifier':
        clf = neighbors.KNeighborsClassifier(10, weights='distance')
    elif model == 'MLPClassifier':
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(1000, 50))
    elif model == 'LinearSVC':
        clf = LinearSVC(loss='hinge')
    else:
        raise ValueError('unkwown model: ' + model)
    print("---> model:",model)
    #
    
    grid_clf = GridSearchCV(estimator=clf, param_grid=parameters[model],
                                cv=4, scoring='accuracy', n_jobs=10)
    np.random.seed(1234)
    perm = np.random.permutation(scaled_X.shape[0])
    XX = scaled_X[perm]
    YY = y[perm]
    grid_clf.fit(XX, YY)
    
    # report 
    mean_test_score = grid_clf.cv_results_['mean_test_score'][grid_clf.best_index_]
    std_test_score = grid_clf.cv_results_['std_test_score'][grid_clf.best_index_]

    mean_score_time = grid_clf.cv_results_['mean_score_time'][grid_clf.best_index_]
    std_score_time = grid_clf.cv_results_['std_score_time'][grid_clf.best_index_]
    
    best_params = str(grid_clf.best_params_)

    print("mean_test_score:",mean_test_score)
    print("std_test_score:",std_test_score)

    print("mean_score_time:",mean_score_time)
    print("std_score_time:",std_score_time)
    sys.stdout.flush()
    
    # grid update 
    perf_panel = perf_panel.append(pd.DataFrame(np.array([[model, best_params,mean_test_score, std_test_score, mean_score_time]]), columns=columns))
    perf_panel.to_csv('./perf_panel.csv')
    print()

---> model: LinearSVC
mean_test_score: 0.992286036036
std_test_score: 0.000682677683163
mean_score_time: 0.676751375198
std_score_time: 0.18354603614

---> model: LogisticRegression
mean_test_score: 0.993918918919
std_test_score: 0.000551686879005
mean_score_time: 0.345002532005
std_score_time: 0.245167709017

---> model: RandomForest
mean_test_score: 0.994707207207
std_test_score: 0.00176266619848
mean_score_time: 4.02918541431
std_score_time: 0.770155361205

---> model: SVC
mean_test_score: 0.997691441441
std_test_score: 0.000537127928726
mean_score_time: 71.2303560972
std_score_time: 0.795172666181

---> model: KNeighborsClassifier
mean_test_score: 0.995720720721
std_test_score: 0.000827530318508
mean_score_time: 749.925899804
std_score_time: 2.97953834374



## Results 

In [11]:
perf = pd.read_csv('./perf_panel.csv')
perf = perf.sort_values(['Accuracy_Mean', 'Predict_Time'], ascending=[1, 0])
perf

Unnamed: 0,Predictive_Model,Parameters,Accuracy_Mean,Accuracy_STD,Predict_Time
0,LinearSVC,{'loss': 'hinge'},0.992286,0.000683,0.29375
1,LogisticRegression,{'C': 0.1},0.993919,0.000552,0.316752
2,RandomForest,"{'criterion': 'gini', 'max_depth': None, 'n_es...",0.994707,0.001784,4.457001
4,KNeighborsClassifier,"{'weights': 'distance', 'n_neighbors': 5}",0.995721,0.000828,698.066331
3,SVC,"{'kernel': 'rbf', 'C': 5}",0.997691,0.000537,67.464008



The most fast model in computing score on new images is LinearSVC and the difference in terms of acccuracy is negligible.  

Let's retrain this model on the whole dataset and let's serialize it. 

In [12]:
# train 
clf = Pipeline([('scaling', StandardScaler()),
                ('classification', LinearSVC(loss='hinge'))])
clf.fit(X, y)

# serialize
from sklearn.externals import joblib
config = dict(color_space=color_space, 
            spatial_size=spatial_size, hist_bins=hist_bins, 
            orient=orient, pix_per_cell=pix_per_cell, 
            cell_per_block=cell_per_block, 
            hog_channel=hog_channel, spatial_feat=spatial_feat, 
            hist_feat=hist_feat, hog_feat=hog_feat)
joblib.dump({'model':clf, 'config':config}, 'classifier.p')

['classifier.p']