## Stochastic Gradient Descent (SGD) Classifier Testing
In this notebook, we test an SGD classifier from scikit-learn for image classification of the sea lion chips.

In [1]:
import cv2
import numpy as np
import pathlib
import pickle
import random

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split

##### Retrieve the arrays and labels via pickle. Assign arrays to X (our training features) and labels to y (our training target/labels)

In [2]:
# Read each pickle file fully into memory and deserialize it:
# X holds the image arrays (features), y holds the matching labels.
# NOTE: unpickling can execute arbitrary code -- only load files from a trusted source.
X = pickle.loads(pathlib.Path('image_arrays.pkl').read_bytes())
y = pickle.loads(pathlib.Path('image_labels.pkl').read_bytes())

##### Create train test split

In [3]:
# Hold out 12% of the samples as a test set. The data is shuffled before
# splitting, and the fixed seed makes the split repeatable across runs.
split_options = dict(test_size=0.12, random_state=42, shuffle=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

##### Initialize the classifier and fit to the training data

In [4]:
# Linear classifier trained with stochastic gradient descent; all
# hyperparameters are left at their defaults except the seed.
# NOTE(review): SGD is sensitive to feature scaling -- confirm whether the
# image arrays were standardized before being pickled.
sgd_clf = SGDClassifier(random_state=42)

# fit() is left as the last expression so the notebook displays the fitted
# estimator and its parameters.
sgd_clf.fit(X=X_train, y=y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

##### Predict on a subset of the test data

In [5]:
# Spot-check: compare predicted and actual classes for the first few
# held-out samples.
n_preview = 20

print(
    "Label codes:  {'adult_females': 0, 'adult_males': 1, 'juveniles': 2, 'pups': 3, 'subadult_males': 4}"
)
print("Predictions: ", sgd_clf.predict(X_test[:n_preview]))
print("Actual classes: ", y_test[:n_preview])

Label codes:  {'adult_females': 0, 'adult_males': 1, 'juveniles': 2, 'pups': 3, 'subadult_males': 4}
Predictions:  [2 3 0 4 3 0 0 3 2 0 0 3 0 0 0 0 3 0 3 0]
Actual classes:  [2, 3, 1, 0, 3, 0, 0, 3, 3, 3, 1, 2, 3, 0, 0, 2, 3, 0, 3, 4]


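##### Create a confusion matrix from cross-validated predictions on the training set (rows = actual class, columns = predicted class) - good explanation here if unfamiliar: https://www.geeksforgeeks.org/confusion-matrix-machine-learning/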

In [6]:
# 3-fold cross-validated predictions over the training set: each sample's
# prediction comes from a model fitted on the folds that exclude it.
y_train_pred = cross_val_predict(estimator=sgd_clf, X=X_train, y=y_train, cv=3)

# Rows index the actual class, columns the predicted class.
conf_matrix = confusion_matrix(y_true=y_train, y_pred=y_train_pred)
conf_matrix

array([[1095,  136,  282,  182,   38],
       [ 156,   23,   34,   25,    4],
       [ 328,   37,  168,   96,    8],
       [ 175,   14,  128,  404,    4],
       [ 141,   12,   26,   19,    1]], dtype=int64)

### Scoring Metrics

##### Mean accuracy

In [7]:
# Accuracy on each of 10 cross-validation folds of the training set;
# the mean over folds is reported.
fold_accuracies = cross_val_score(sgd_clf, X_train, y_train, cv=10, scoring='accuracy')
print("Mean accuracy score for SGD Classifier: ", fold_accuracies.mean())

Mean accuracy score for SGD Classifier:  0.4793098314765847


##### F1 Scores

In [8]:
# average=None returns one F1 score per class instead of a single aggregate.
# y_train_pred holds the 3-fold cross-validated predictions computed earlier.
per_class_f1 = f1_score(y_true=y_train, y_pred=y_train_pred, average=None)
per_class_f1

array([0.60363837, 0.09913793, 0.26352941, 0.55685734, 0.00787402])