## Stochastic Gradient Descent (SGD) Classifier Testing
In this notebook, test an SGD Classifier from scikit learn for image classification of the sea lion chips.

In [28]:
import cv2
import numpy as np
import pathlib
import pickle
import random

from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import cross_val_predict, cross_val_score, train_test_split

##### Retrieve the arrays and labels via pickle. Assign arrays to X (our training features) and labels to y (our training target/labels)

In [5]:
with open('image_arrays.pkl', 'rb') as f:
    X = pickle.load(f)
with open('image_labels.pkl', 'rb') as f:
    y = pickle.load(f)

##### Create train test split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.12,
                                                    random_state=42,
                                                    shuffle=True)

##### Initialize the classifier and fit to the training data

In [11]:
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None,
              early_stopping=False, epsilon=0.1, eta0=0.0, fit_intercept=True,
              l1_ratio=0.15, learning_rate='optimal', loss='hinge',
              max_iter=1000, n_iter_no_change=5, n_jobs=None, penalty='l2',
              power_t=0.5, random_state=42, shuffle=True, tol=0.001,
              validation_fraction=0.1, verbose=0, warm_start=False)

##### Predict on subset of the test data

In [24]:
print(
    "Label codes:  {'adult_females': 0, 'adult_males': 1, 'juveniles': 2, 'pups': 3, 'subadult_males': 4}"
)
print("Predictions: ", sgd_clf.predict(X_test[0:20]))
print("Actual classes: ", y_test[0:20])

Label codes:  {'adult_females': 0, 'adult_males': 1, 'juveniles': 2, 'pups': 3, 'subadult_males': 4}
Predictions:  [3 4 1 3 4 3 4 3 1 0 3 3 3 4 1 4 3 3 4 3]
Actual classes:  [3, 0, 0, 2, 0, 1, 1, 3, 1, 2, 3, 3, 3, 0, 0, 0, 0, 3, 0, 3]


##### Create confusion matrix - good explanation here if unfamiliar: https://www.geeksforgeeks.org/confusion-matrix-machine-learning/

In [17]:
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train, cv=3)

conf_matrix = confusion_matrix(y_train, y_train_pred)
conf_matrix

array([[371,  59,  63,  63,  48],
       [ 42,  10,   7,   4,   4],
       [ 75,  11,  12,  23,   5],
       [ 20,   1,   7, 210,   2],
       [ 28,   2,   5,   8,   1]], dtype=int64)

### Scoring Metrics

##### Mean accuracy

In [22]:
print("Mean accuracy score for SGD Classifier: ", \
      cross_val_score(sgd_clf, X_train, y_train, cv=10, scoring='accuracy').mean())

Mean accuracy score for SGD Classifier:  0.5724084404854992


##### F1 Scores

In [35]:
f1_score(y_train, y_train_pred, average=None)

array([0.65087719, 0.13333333, 0.10909091, 0.76642336, 0.01923077])