## Machine Learning with Scikit-Learn

Scikit-Learn has simple and intuitive API to start learning and implementing traditional ML algorithms.

Here we will use two classification methods to create a model to predict the category of a new gear image, training on the preprocessed 128x128x3 gear data from the previous step. The two classification models will be evaluated and their accuracy will be compared.

In [1]:
import os
import pandas as pd
import numpy as np
from PIL import Image
from scipy.stats import randint

from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

Label encoder function to have the predicted labels as numbers

In [2]:
def label_encoder(key):
    label_mapping = {
        "axes" : "1",
        "boots" : "2",
        "carabiners" : "3",
        "crampons" : "4",
        "gloves" : "5",
        "hardshell_jackets" : "6",
        "harnesses" : "7",
        "helmets" : "8",
        "insulated_jackets" : "9",
        "pulleys" : "10",
        "rope" : "11",
        "tents" : "12",
    }
    return int(label_mapping[key])

#### Generate Dataframe
Create a dataframe containing the pixel image array and associate category for each image

In [3]:
rootDir = 'gear_images/'
directories = ['axes', 'boots', 'carabiners', 'crampons', 'gloves', 'hardshell_jackets', 'harnesses',
              'helmets', 'insulated_jackets', 'pulleys', 'rope', 'tents']


df = pd.DataFrame()
category = [] # List of labels (numbered [1,12])
pixel_array = [] # List of Flatten Pixel array for each image

for directory in directories:   
    folderPath = rootDir + '/' + directory + '/'
    print('Folder: {}'.format(folderPath))
    for fname in os.listdir(folderPath):
        if fname.endswith('resized_equalized.jpeg'):
            im = Image.open(folderPath + fname)
            im_array = np.array(im, dtype=float).flatten()
            
            # Append data to list
            category.append(label_encoder(directory))
            pixel_array.append(im_array)

pd_dict = {
    'pixel_array' : pixel_array,
    'category' : category,
}
df = pd.DataFrame(pd_dict)
df['category'] = pd.to_numeric(df['category'])
df = shuffle(df)

Folder: gear_images//axes/
Folder: gear_images//boots/
Folder: gear_images//carabiners/
Folder: gear_images//crampons/
Folder: gear_images//gloves/
Folder: gear_images//hardshell_jackets/
Folder: gear_images//harnesses/
Folder: gear_images//helmets/
Folder: gear_images//insulated_jackets/
Folder: gear_images//pulleys/
Folder: gear_images//rope/
Folder: gear_images//tents/


In [4]:
print(df.shape)
df.info()
df.head()

(2063, 2)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2063 entries, 53 to 821
Data columns (total 2 columns):
pixel_array    2063 non-null object
category       2063 non-null int64
dtypes: int64(1), object(1)
memory usage: 48.4+ KB


Unnamed: 0,pixel_array,category
53,"[255.0, 255.0, 255.0, 255.0, 255.0, 255.0, 255...",1
250,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
379,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
330,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",3
1153,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",6


#### Spit into Training and Testing Datasets

In [5]:
# Splitting Training and Testing Data
X_train, X_test, y_train, y_test = train_test_split(df['pixel_array'],
                                                    df['category'],
                                                    test_size = 0.2,
                                                    random_state=42,
                                                    stratify=df['category'])
X_train = X_train.tolist()
X_test = X_test.tolist()

### Decision Tree - Hyperparameter Tuning

In [6]:
# Setup the parameters and distributions to sample from: param_dist
param_dist = {"max_depth": [3, 6, 9, None],
              "max_features": randint(6, 9),
              "min_samples_leaf": randint(1, 3),
              "criterion": ["gini", "entropy"]}

# Instantiate Decision Tree Model
dtree_model = DecisionTreeClassifier()

# Instantiate the RandomizedSearchCV object
tree_cv = RandomizedSearchCV(dtree_model, param_dist, cv=4)

# Fit to the data
tree_cv.fit(X_train, y_train)

# Print the tuned parameters and best score
print("Tuned Decision Tree Parameters: {}\n".format(tree_cv.best_params_))
print("Tuned Decision Tree Model Best score is {}".format(tree_cv.best_score_))



Tuned Decision Tree Parameters: {'criterion': 'entropy', 'max_depth': None, 'max_features': 6, 'min_samples_leaf': 1}

Tuned Decision Tree Model Best score is 0.6284848484848485


### Decision Tree - Confusion Matrix, Classification Report, ROC Curve

In [7]:
# Instantiate a Decision Tree Classifier
dtree_model = DecisionTreeClassifier(max_depth=None,
                                     max_features=6,
                                     min_samples_leaf=1,
                                     criterion='entropy')

# Fit to the data
dtree_model.fit(X_train, y_train)

# Predicting labels from testing data
y_pred_dtree = dtree_model.predict(X_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_dtree)
print('\nConfusion Matrix : \n', cm)

# Classification report
report = classification_report(y_test, y_pred_dtree)
print("\nClassification Report : \n", report)

# Model accuracy computed with 5-fold cross-validation scores
cv_scores_tree = cross_val_score(dtree_model, X_test, y_test, cv=10)
print('\nDecision Tree Model Accuracy : \n', np.mean(cv_scores_tree))


Confusion Matrix : 
 [[ 7  0  4  2  2  0  0  0  0  0  0  1]
 [ 0 20  0  1  1  0  0  0  0  0  1  0]
 [ 2  0 35  3  4  0  5  0  1  1  1  0]
 [ 2  2  1 10  1  0  2  1  0  0  3  2]
 [ 0  1  1  3 14  4  4  2  1  0  5  0]
 [ 1  0  1  0  4 58  1  0 22  2  2  1]
 [ 0  1  2  4  1  0 23  1  1  1  0  2]
 [ 0  0  2  0  0  0  1 13  1  1  0  0]
 [ 0  0  0  0  0 21  1  0 20  1  3  1]
 [ 0  0  0  0  1  1  2  1  1  2  0  0]
 [ 0  0  2  2  3  0  1  0  1  1 28  1]
 [ 0  0  1  5  0  0  0  1  0  0  3 13]]

Classification Report : 
               precision    recall  f1-score   support

           1       0.58      0.44      0.50        16
           2       0.83      0.87      0.85        23
           3       0.71      0.67      0.69        52
           4       0.33      0.42      0.37        24
           5       0.45      0.40      0.42        35
           6       0.69      0.63      0.66        92
           7       0.57      0.64      0.61        36
           8       0.68      0.72      0.70      




Decision Tree Model Accuracy : 
 0.5456681542430346


### SVM - Hyperparameter Tuning

In [None]:
# Specify the hyperparameter space
parameters = {'C':[1, 10, 100],
              'kernel':['linear', 'rbf'],
              'gamma':[0.00001, 0.0001, 0.001, 0.01, 0.1]}

# Instantiate an SVM classifier
svm_model = SVC()

# Instantiate the RandomizedSearchCV object
searcher = RandomizedSearchCV(svm_model, parameters, cv=5)

# Fit to the training set
searcher.fit(X_train, y_train)

# Print the tuned parameters and best score
print("Tuned Model Parameters: {}".format(searcher.best_params_))
print("Tuned Model Score: {}".format(searcher.best_score_))



### SVM - Confusion Matrix, Classification Report, ROC Curve

In [8]:
# Instantiate an SVM classifier
svm_model_linear = SVC(kernel = 'linear', C = 1)

# Fit to the data
svm_model_linear.fit(X_train, y_train)

# Predicting labels from testing data
y_pred_svm = svm_model_linear.predict(X_test)

# Confusion matrix
cm = confusion_matrix(y_test, y_pred_svm)

# Model accuracy computed with 5-fold cross-validation scores
cv_scores_svm = cross_val_score(svm_model_linear, X_test, y_test, cv=5)
print('\nSVM Model Accuracy : ', np.mean(cv_scores_svm))


SVM Model Accuracy :  0.8068695527688403
