# Data Science Section - Phase 3
# Author: Peter Lee

1. Loading dataset

In [8]:
import pickle
import numpy as np
import pandas as pd

def unpickle(file):
    with open(file, 'rb') as fo:
        dict = pickle.load(fo, encoding='bytes')
    return dict

batch1 = unpickle('./data/data_batch_1')
batch2 = unpickle('./data/data_batch_2')
batch3 = unpickle('./data/data_batch_3')
batch4 = unpickle('./data/data_batch_4')
batch5 = unpickle('./data/data_batch_5')
test = unpickle('./data/test_batch')

2. Exploratory Data Analaysis

In [9]:
# Each batch contains 10000 images with each image storing a 32x32 colour image. The first 1024 contains the red channel values, the next 1024 contains the green, and the final contains the blue.
# Therefore each batch should have a shape of 10000x3072. I will only take the first 2000 images from each batch to have a total of 10000 images for the training batch. Since the images are ordered randomly for each batch
# there should not be any large imbalances between the classes.

train1_x = batch1[b'data'][:2000]
# train1_x = train1_x[:2000]
print(train1_x.shape)
train1_y = batch1[b'labels'][:2000]

train2_x = batch2[b'data'][:2000]
print(train2_x.shape)
train2_y = batch2[b'labels'][:2000]

train3_x = batch3[b'data'][:2000]
print(train3_x.shape)
train3_y = batch3[b'labels'][:2000]

train4_x = batch4[b'data'][:2000]
print(train4_x.shape)
train4_y = batch4[b'labels'][:2000]

train5_x = batch5[b'data'][:2000]
print(train5_x.shape)
train5_y = batch5[b'labels'][:2000]

# Combining all the batches into one list. Since each array will be concatenated in order, the indeces for labels with the corresponding images will stay the same.

train_x = np.concatenate((train1_x, train2_x, train3_x, train4_x, train5_x))
train_y = np.concatenate((train1_y, train2_y, train3_y, train4_y, train5_y))

test_x = test[b'data'][:2000]
print(test_x.shape)
test_y = test[b'labels'][:2000]

# Now we have all the required data and labels to be able to train.

(2000, 3072)
(2000, 3072)
(2000, 3072)
(2000, 3072)
(2000, 3072)
(2000, 3072)


In [10]:
print(train_x.shape)
print(train_y.shape)
print(train_x[2])

(10000, 3072)
(10000,)
[255 253 253 ...  83  83  84]


In [23]:
from PIL import Image

#Producing the first 5 images from the training batch.

for numImg in range(5):
    w, h = 32, 32
    data = np.zeros((h, w, 3), dtype=np.uint8)
    for i in range(32):
        for j in range(32):
            data[i][j] = [train_x[numImg][i * 32 + j], train_x[numImg][i * 32 + j + 1024], train_x[numImg][i * 32 + j + 2048]]

    img = Image.fromarray(data, 'RGB')
    img.save(f"img{numImg}.png")
    img.show()

The label I will be choosing is "ship" which is a value of 8.

# 3. Data Processing

In [12]:
filter_train_x = []
filter_train_y = []
filter_test_x = []
filter_test_y = []

for i in range(len(train_y)):
    if(train_y[i] == 8):
        filter_train_x.append(train_x[i])
        filter_train_y.append(train_y[i])

for i in range(len(test_y)):
    if(test_y[i] == 8):
        filter_test_x.append(test_x[i])
        filter_test_y.append(test_y[i])

print(f"Length of filtered training data: {len(filter_train_y)}")
print(f"Length of filtered testing data: {len(filter_test_y)}")

Length of filtered training data: 1045
Length of filtered testing data: 217


In [13]:
counter = [0,0,0,0,0,0,0,0,0,0]

for i in range(len(test_y)):
    counter[test_y[i]] += 1

print(f"Number of occurances for each class in test batch: {counter}")

counter = [0,0,0,0,0,0,0,0,0,0]

for i in range(len(train_y)):
    counter[train_y[i]] += 1

print(f"Number of occurances for each class in train batch: {counter}")

Number of occurances for each class in test batch: [196, 198, 195, 199, 198, 185, 216, 193, 217, 203]
Number of occurances for each class in train batch: [981, 1003, 994, 982, 1033, 969, 999, 988, 1045, 1006]


We can see there are no large imbalances in the number of occuring classes in both the training and testing batch.

Currently, the labels for each class are integer values from 0-9, however to use these values with a NN we need to convert these values into a more useable form as node in a NN can only output a zero or a one. To be able to classify each class individual with 10 output nodes, I will use a technique called "One hot encoding" where each numeric value is converted in an array where all the values are zero except for the corresponding index. For example, a value of 3 will be converted to: [0, 0, 0, 1, 0, 0, 0, 0, 0, 0]. The 4th (starting from 0) index is 1 while every other index is 0.

In [14]:
ohn_test_y = np.array([[int(i == label_indx) for i in range(10)] for label_indx in test_y])

ohn_filter_train_y = np.array([[int(i == label_indx) for i in range(10)] for label_indx in filter_train_y])
ohn_filtered_test_y = np.array([[int(i == label_indx) for i in range(10)] for label_indx in filter_test_y])

In [15]:
print(train_y)
print(test_y)

print(train_x.shape)
print(train_y.shape)

[6 9 9 ... 5 3 2]
[3, 8, 8, 0, 6, 6, 1, 6, 3, 1, 0, 9, 5, 7, 9, 8, 5, 7, 8, 6, 7, 0, 4, 9, 5, 2, 4, 0, 9, 6, 6, 5, 4, 5, 9, 2, 4, 1, 9, 5, 4, 6, 5, 6, 0, 9, 3, 9, 7, 6, 9, 8, 0, 3, 8, 8, 7, 7, 4, 6, 7, 3, 6, 3, 6, 2, 1, 2, 3, 7, 2, 6, 8, 8, 0, 2, 9, 3, 3, 8, 8, 1, 1, 7, 2, 5, 2, 7, 8, 9, 0, 3, 8, 6, 4, 6, 6, 0, 0, 7, 4, 5, 6, 3, 1, 1, 3, 6, 8, 7, 4, 0, 6, 2, 1, 3, 0, 4, 2, 7, 8, 3, 1, 2, 8, 0, 8, 3, 5, 2, 4, 1, 8, 9, 1, 2, 9, 7, 2, 9, 6, 5, 6, 3, 8, 7, 6, 2, 5, 2, 8, 9, 6, 0, 0, 5, 2, 9, 5, 4, 2, 1, 6, 6, 8, 4, 8, 4, 5, 0, 9, 9, 9, 8, 9, 9, 3, 7, 5, 0, 0, 5, 2, 2, 3, 8, 6, 3, 4, 0, 5, 8, 0, 1, 7, 2, 8, 8, 7, 8, 5, 1, 8, 7, 1, 3, 0, 5, 7, 9, 7, 4, 5, 9, 8, 0, 7, 9, 8, 2, 7, 6, 9, 4, 3, 9, 6, 4, 7, 6, 5, 1, 5, 8, 8, 0, 4, 0, 5, 5, 1, 1, 8, 9, 0, 3, 1, 9, 2, 2, 5, 3, 9, 9, 4, 0, 3, 0, 0, 9, 8, 1, 5, 7, 0, 8, 2, 4, 7, 0, 2, 3, 6, 3, 8, 5, 0, 3, 4, 3, 9, 0, 6, 1, 0, 9, 1, 0, 7, 9, 1, 2, 6, 9, 3, 4, 6, 0, 0, 6, 6, 6, 3, 2, 6, 1, 8, 2, 1, 6, 8, 6, 8, 0, 4, 0, 7, 7, 5, 5, 3, 5, 2, 3, 4, 1, 7, 

# 4. Data Modelling

In [16]:
import tensorflow as tf
from keras.api._v2.keras import layers, Sequential, Input, optimizers, losses
from sklearn.model_selection import StratifiedShuffleSplit, GridSearchCV
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score
from keras.wrappers.scikit_learn import KerasClassifier
import datetime

# My model was having problems with evaluating labels that are one hot encoded, so I will use a regular 1-D with integers as labels.
# This is possible as I am using sparse categorical cross entropy, which allows the model to use regular integers to classify datasets.
# The same loss function is used so the model won't behave differently.
def getSparceModel():
    model = Sequential(name="32x32_Image_Classification_Model")
    model.add(layers.Dense(255, input_dim=3072, activation='relu', name='input'))

    # Squishing (or normalizing) the RGB values from 0-255 to 0-1.
    model.add(layers.Rescaling(scale=1./255, name="Normaliser"))
    model.add(layers.Flatten())

    model.add(layers.Dense(255, activation='relu'))
    model.add(layers.Dense(255, activation='relu'))
    model.add(layers.Dense(255, activation='relu'))
    model.add(layers.Dense(255, activation='relu'))

    model.add(layers.Dense(10, activation="softmax", name='output'))

    model.compile(
        optimizer=optimizers.Adam(learning_rate=0.0001), 
        loss='sparse_categorical_crossentropy',
        metrics=['sparse_categorical_accuracy', 'accuracy'],
    )

    return model

batch_size = [650, 750, 850]
epochs = [75, 100, 125]

param_grid = dict(epochs=epochs, batch_size=batch_size)
model = KerasClassifier(build_fn=getSparceModel, verbose=1)

grid = GridSearchCV(estimator=model, param_grid=param_grid, scoring='accuracy', n_jobs=1, refit='boolean')
grid_result = grid.fit(train_x, train_y)

print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))


We can see from the GridSearch the best result is: Best: 0.454400 using {'batch_size': 750, 'epochs': 125}.

In [17]:
#model.save('output/my_model2')
ohn_train_y = np.array([[int(i == label_indx) for i in range(10)] for label_indx in train_y])

print(ohn_train_y.shape)

(10000, 10)


In [18]:
print(train_y.shape)

(10000,)


In [19]:
model = getSparceModel()

sss = StratifiedShuffleSplit(n_splits=15)

sss.get_n_splits(train_x, train_y)

acc = []

log_dir = "logs/fit/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir=log_dir, histogram_freq=1)

for train_index, test_index in sss.split(train_x, train_y):
    X_train, X_test = train_x[train_index], train_x[test_index]
    y_train, y_test = train_y[train_index], train_y[test_index]
    model.fit(np.array(X_train), np.array(y_train), validation_data=(np.array(test_x), np.array(test_y)), callbacks=[tensorboard_callback])
    pred = model.predict(X_test)
    pred = np.argmax(pred, axis=1)
    acc.append(accuracy_score(y_test, pred))

print(acc)

test_pred = model.predict(np.array(test_x))
model.evaluate(np.array(test_x), np.array(test_y))

[0.326, 0.351, 0.353, 0.36, 0.402, 0.422, 0.388, 0.448, 0.479, 0.463, 0.518, 0.478, 0.505, 0.488, 0.51]


[1.5870673656463623, 0.453000009059906, 0.453000009059906]

In [20]:
test_y = np.array(test_y)
filter_test_y = np.array(filter_test_y)
filter_test_x = np.array(filter_test_x)


print(filter_test_x.shape)
print(filter_test_y.shape)

print(f"Evaluation will every image from test batch: {model.evaluate(test_x, test_y)}")

print(f"Evaluation will every image from filtered test batch: {model.evaluate(filter_test_x, filter_test_y, callbacks=tensorboard_callback)}")

pred = model.predict(filter_test_x)
pred = np.argmax(pred, axis=1)
print(pred)

(217, 3072)
(217,)
Evaluation will every image from test batch: [1.5870673656463623, 0.453000009059906, 0.453000009059906]
Evaluation will every image from filtered test batch: [1.0936577320098877, 0.6912442445755005, 0.6912442445755005]
[9 8 8 8 4 8 8 8 8 8 8 8 8 1 8 0 8 9 9 8 9 8 8 8 3 8 2 5 8 0 0 4 8 7 5 8 9
 2 8 0 8 8 8 1 8 0 0 3 8 8 8 8 8 9 4 0 0 8 8 8 8 8 8 8 8 8 5 8 9 8 5 8 8 2
 8 8 8 8 9 8 8 8 8 8 8 8 4 8 8 8 8 8 8 8 8 8 8 3 8 8 8 8 8 8 8 8 8 8 8 8 8
 8 8 8 8 8 8 8 1 8 8 8 9 8 8 0 8 8 0 0 8 0 8 5 1 8 8 0 8 8 8 8 8 1 2 8 0 7
 8 8 8 8 3 8 8 8 8 8 8 8 8 8 8 8 8 8 4 8 8 8 7 8 9 4 3 9 8 8 3 8 8 6 1 8 1
 1 8 8 8 0 8 8 8 8 1 8 8 9 8 8 8 8 8 0 2 8 0 8 8 8 8 8 8 8 2 6 3]


We can see the overall accuracy of the model is around 45% which is quite low, however when testing the model with the filtered test batch which only contains the images of the class ship, the accuracy is much better at around 75%.

### Function that takes any image, resizes it to 32 x 32 and then evaluates it using the model:

In [25]:
model.save('./output/mymodel')

import cv2
from PIL import Image

def predictImg(img, model, y):
    newImg = cv2.imread(img)

    newImg = cv2.resize(newImg, (32, 32))

    odimg = np.reshape(newImg, (3072,))

    arr = np.array([odimg])

    print(arr.shape)

    pred = model.predict(arr)
    pred = np.argmax(pred, axis=1)
    
    print(f"Model prediction: {pred}, Actual class: {y}")

    return newImg

y = np.array([8])

img = predictImg('./images/ship.jpg', model=model, y=y)


INFO:tensorflow:Assets written to: ./output/mymodel\assets
(1, 3072)
Model prediction: [4], Actual class: [8]


In [26]:
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()