In [None]:
%matplotlib inline

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os
from skimage.io import imread
from skimage.transform import resize

import tensorflow as tf

from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Conv2D, MaxPool2D, BatchNormalization, Flatten, Dense, Dropout
from tensorflow.keras.callbacks import TensorBoard
from tensorflow.keras.optimizers import RMSprop

from tensorflow.keras.applications import vgg19

In [None]:
np.random.seed(123456780)

In [None]:
# This prevents a nasty bug: using preprocess_input within a Dataset
# throws an error due to a global variable not being initialized 
# in the tensor context
_ = vgg19.preprocess_input(tf.zeros((1, 224, 224, 3)))

# Sneaker Type Classification
## Classifying series of Jordan basketball sneakers

Dataset location: https://www.kaggle.com/sebastiaanjohn/sneakers

In [None]:
# Root folder that is scanned for sub-folders of sneaker images.
BASE_DIR = "jordans"
# Output folder for the CSV files describing the selected images and splits.
DATA_DIR = "data_descriptors"

# Per-model share of images that split_data assigns to training / validation.
TRAIN_PCT = 0.8
VAL_PCT = 0.1
# NOTE(review): not referenced in the visible code; split_data gives the test
# split whatever remains after the train and validation slices.
TEST_PCT = 0.1

# Target size passed to tf.image.resize when images are loaded.
IMAGE_SIZE = (224, 224)
# Input shape of the proof-of-concept CNN: IMAGE_SIZE plus 3 channels.
IMAGE_SIZE_INPUT = (224, 224, 3)

In [None]:
# Create the output folder for the CSV descriptors. exist_ok avoids the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs(DATA_DIR, exist_ok = True)

In [None]:
# Collect absolute image paths per sneaker model. The first two
# space-separated tokens of each sub-folder name (e.g. "Jordan 1") form the
# model name, so several folders can contribute to the same model.
model_filenames = {}
# Sorted so that the row order (and therefore the seeded sampling further
# down) does not depend on the arbitrary order os.listdir returns.
for model_dir in sorted(os.listdir(BASE_DIR)):
    full_model_dir = os.path.join(BASE_DIR, model_dir)
    model_dir_parts = model_dir.split(" ", maxsplit = 2)

    # Stray files (e.g. ".DS_Store") and folders without a two-token name
    # would otherwise fail with NotADirectoryError / IndexError.
    if not os.path.isdir(full_model_dir) or len(model_dir_parts) < 2:
        print(f"Skipping {model_dir}")
        continue

    model_name = f"{model_dir_parts[0]} {model_dir_parts[1]}"
    print(model_dir)

    files_in_model_dir = [
        os.path.abspath(os.path.join(full_model_dir, file))
        for file in sorted(os.listdir(full_model_dir))
    ]

    model_filenames.setdefault(model_name, []).extend(files_in_model_dir)

In [None]:
# Flatten the {model: [filenames]} mapping into one row per image.
# DataFrame.append was removed in pandas 2.0 (and copied the whole frame on
# every call), so the records are collected first and the frame is built once.
# This also gives the frame a unique 0..n-1 index instead of one that
# restarts for every model.
records = [
    {"filename": filename, "model": model_name}
    for (model_name, filenames) in model_filenames.items()
    for filename in filenames
]
# Explicit columns keep the schema even when no images were found.
model_filenames_df = pd.DataFrame(records, columns = ["filename", "model"])

In [None]:
model_filenames_df

In [None]:
# Number of images available per sneaker model, as a horizontal bar chart.
groups_by_num_images = model_filenames_df.groupby("model").size()
plt.barh(groups_by_num_images.index, groups_by_num_images)
# Labels and title so the figure can be read on its own.
plt.xlabel("Number of images")
plt.ylabel("Model")
plt.title("Images per model")
plt.show()

In [None]:
groups_by_num_images

In [None]:
filenames_to_test = model_filenames_df.sample(10)

In [None]:
def display_images(image_descriptors):
    """Plot every image listed in `image_descriptors`, titled with its class.

    Parameters
    ----------
    image_descriptors : pandas.DataFrame
        Frame whose rows unpack to exactly (filename, model class), in that
        order; only its `.values` attribute is used.

    Files that cannot be loaded or drawn are reported with a printed message
    and skipped, so one broken file does not abort the whole gallery.
    """
    for filename, model_class in image_descriptors.values:
        try:
            image = imread(filename)
            plt.imshow(image)
            plt.title(model_class)
            plt.show()
        # `Exception` rather than a bare `except:` so that interrupting the
        # kernel (KeyboardInterrupt) or SystemExit still stops the loop.
        except Exception:
            print(f"Could not read {filename}")

In [None]:
display_images(filenames_to_test)

In [None]:
# Sneaker models kept for classification. A model's position in this list is
# the integer class label it is mapped to.
models_to_select = ["Jordan 1", "Jordan 4", "Jordan 6", "Jordan 11"]
models_map = {name: label for (label, name) in enumerate(models_to_select)}

# Number of images drawn per model when the dataset is balanced.
max_num_samples_to_select = 1200

In [None]:
selected_model_filenames_df = model_filenames_df[model_filenames_df.model.isin(models_to_select)]

In [None]:
selected_model_filenames_df

In [None]:
def get_readable_files(filenames):
    """Report, for every path in `filenames`, whether `imread` can load it.

    Parameters
    ----------
    filenames : iterable of str
        Image paths to probe.

    Returns
    -------
    list of bool
        One flag per input path, in input order: True when `imread`
        succeeded, False when it raised an exception.
    """
    readable_files = []
    for filename in filenames:
        try:
            imread(filename)
        # `Exception` rather than a bare `except:` so that a kernel interrupt
        # during this (potentially long) scan is not recorded as "unreadable".
        except Exception:
            readable_files.append(False)
        else:
            readable_files.append(True)
    return readable_files

In [None]:
readable_files = get_readable_files(selected_model_filenames_df.filename.values)

In [None]:
# Attach the readability flags as a new column; readable_files[i] belongs to
# the i-th row. `assign` returns a new frame instead of writing into a
# filtered selection of model_filenames_df, which avoids pandas'
# SettingWithCopyWarning. The former re-wrap of the column in pd.Series was a
# no-op and has been dropped.
selected_model_filenames_df = selected_model_filenames_df.assign(is_readable = readable_files)

In [None]:
selected_model_filenames_df = selected_model_filenames_df[selected_model_filenames_df.is_readable]

In [None]:
# Balance the dataset: draw the same number of readable images for every model.
# The draw size is capped by both max_num_samples_to_select and the smallest
# group, because sampling without replacement raises ValueError when a group
# holds fewer rows than requested.
readable_groups = [
    group_filenames[group_filenames.is_readable]
    for (_, group_filenames) in selected_model_filenames_df.groupby("model")
]

if readable_groups:
    num_samples_per_model = min(
        max_num_samples_to_select,
        min(len(readable_filenames) for readable_filenames in readable_groups))
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    selected_model_filenames_balanced = pd.concat([
        readable_filenames.sample(num_samples_per_model, replace = False)
        for readable_filenames in readable_groups
    ])
else:
    # Nothing to sample from: keep an empty frame with the expected columns.
    selected_model_filenames_balanced = selected_model_filenames_df.iloc[:0]

In [None]:
# The readability flag is only a helper column; remove it, then write the
# balanced file list to DATA_DIR.
selected_model_filenames_balanced = selected_model_filenames_balanced.drop(
    columns = ["is_readable"])
selected_model_filenames_balanced.to_csv(
    os.path.join(DATA_DIR, "filenames.csv"),
    index = False)

In [None]:
def split_data(dataset):
    """Split `dataset` into train / validation / test frames, per model.

    Every group of rows sharing the same "model" value is shuffled and cut
    separately, so each model contributes the same proportions to each split.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with a "model" column to group by.

    Returns
    -------
    tuple of pandas.DataFrame
        (train_data, val_data, test_data). Train receives
        int(len(group) * TRAIN_PCT) rows of each group, validation the next
        int(len(group) * VAL_PCT) rows, and test whatever remains. For an
        empty `dataset` all three frames are empty but keep its columns.
    """
    train_parts = []
    val_parts = []
    test_parts = []

    for (_, group_filenames) in dataset.groupby("model"):
        # Sampling all rows without replacement is a shuffle.
        group_filenames = group_filenames.sample(len(group_filenames))

        train_data_end_index = int(len(group_filenames) * TRAIN_PCT)
        val_data_end_index = train_data_end_index + int(len(group_filenames) * VAL_PCT)

        train_parts.append(group_filenames[:train_data_end_index])
        val_parts.append(group_filenames[train_data_end_index:val_data_end_index])
        test_parts.append(group_filenames[val_data_end_index:])

    def _combine(parts):
        # One concat per split replaces repeated DataFrame.append calls
        # (removed in pandas 2.0); pd.concat itself rejects an empty list.
        return pd.concat(parts) if parts else dataset.iloc[:0]

    return (_combine(train_parts), _combine(val_parts), _combine(test_parts))

In [None]:
train_data, val_data, test_data = split_data(selected_model_filenames_balanced)

In [None]:
# Write each split to its own CSV file inside DATA_DIR.
for (split_name, split_frame) in (("train", train_data), ("val", val_data), ("test", test_data)):
    split_frame.to_csv(os.path.join(DATA_DIR, f"{split_name}.csv"), index = False)

In [None]:
train_data.groupby("model").size()

In [None]:
train_data_sample = train_data.sample(10)

In [None]:
# Replace the model names with their integer class labels. Values that are not
# keys of models_map (e.g. labels already converted by an earlier run of this
# cell) pass through unchanged, so re-running the cell no longer turns the
# column into NaN.
train_data_sample = train_data_sample.assign(
    model = train_data_sample.model.map(lambda name: models_map.get(name, name)))

In [None]:
train_data_sample

In [None]:
display_images(train_data_sample)

In [None]:
def read_image(filename, model):
    """Load one image file and scale it to [0, 1] for the small CNN.

    Parameters
    ----------
    filename : string tensor
        Path of the image file to read.
    model : tensor
        Class label; passed through untouched.

    Returns
    -------
    tuple
        (image, model), where image has been resized to IMAGE_SIZE and has
        3 channels.
    """
    file = tf.io.read_file(filename)
    # channels = 3 forces RGB output. Without it a grayscale file decodes to
    # a single channel (and the channel dimension is unknown while the
    # dataset graph is traced), which does not match IMAGE_SIZE_INPUT.
    image = tf.image.decode_jpeg(file, channels = 3)
    image = tf.image.resize(image, IMAGE_SIZE)
    # 8-bit pixel values top out at 255, so dividing by 255 (not 256) maps
    # them onto the full [0, 1] range.
    image = image / 255.0
    return (image, model)

In [None]:
poc_dataset = tf.data.Dataset.from_tensor_slices(
    (train_data_sample.filename.values, train_data_sample.model.values))

In [None]:
# Decode the sampled files via read_image, serve them in batches of 10 and
# repeat endlessly; the fit / evaluate calls bound the iteration through
# their step counts.
poc_dataset = poc_dataset.map(read_image).batch(10).repeat()

In [None]:
tf.keras.backend.clear_session()

In [None]:
# Proof-of-concept CNN: three 3x3 convolutions with shrinking filter counts
# (max-pooling after the first two), followed by a small dense head.
cnn_model = Sequential([
    Input(shape = IMAGE_SIZE_INPUT),
    Conv2D(filters = 64, kernel_size = 3, padding = "same", activation = "relu"),
    MaxPool2D(),
    Conv2D(filters = 32, kernel_size = 3, padding = "same", activation = "relu"),
    MaxPool2D(),
    Conv2D(filters = 16, kernel_size = 3, padding = "same", activation = "relu"),
    Flatten(),
    Dense(units = 16, activation = "relu"),
    Dropout(rate = 0.25),
    # One softmax output per selected sneaker model.
    Dense(units = len(models_to_select), activation = "softmax"),
])

In [None]:
cnn_model.summary()

In [None]:
cnn_model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [None]:
cnn_model.fit(poc_dataset, epochs = 30, steps_per_epoch = 1, callbacks = [TensorBoard()])

In [None]:
cnn_model.evaluate(poc_dataset, steps = 1)

In [None]:
vgg_model = vgg19.VGG19()

In [None]:
def read_image_vgg(filename, model):
    """Load one image file and apply the VGG19 input preprocessing.

    Parameters
    ----------
    filename : string tensor
        Path of the image file to read.
    model : tensor
        Class label; passed through untouched.

    Returns
    -------
    tuple
        (image, model), where image has been resized to IMAGE_SIZE, has
        3 channels and has gone through vgg19.preprocess_input.
    """
    file = tf.io.read_file(filename)
    # channels = 3 forces RGB output. vgg19.preprocess_input works per colour
    # channel, so a grayscale file decoded to a single channel would not fit.
    image = tf.image.decode_jpeg(file, channels = 3)
    image = tf.image.resize(image, IMAGE_SIZE)
    image = vgg19.preprocess_input(image)
    return (image, model)

In [None]:
# Same sampled files and labels as the proof-of-concept dataset, but decoded
# with the VGG19 preprocessing; batches of 10, repeated endlessly.
vgg_dataset = (
    tf.data.Dataset
    .from_tensor_slices((train_data_sample.filename.values, train_data_sample.model.values))
    .map(read_image_vgg)
    .batch(10)
    .repeat()
)

In [None]:
for i, layer in enumerate(vgg_model.layers):
    print(i, layer.name)

In [None]:
# Feature extractor: VGG19 from its input up to and including the "flatten"
# layer (index 22 in the layer listing), i.e. without the fully connected
# ImageNet classifier on top.
vgg_transfer = Model(
    inputs = vgg_model.input,
    outputs = vgg_model.get_layer("flatten").output)

In [None]:
for layer in vgg_transfer.layers:
    layer.trainable = False

In [None]:
vgg_transfer.summary()

In [None]:
# Transfer-learning classifier: the VGG19 feature extractor followed by a
# small dense head with one softmax output per selected sneaker model.
transfer_model = Sequential([
    vgg_transfer,
    Dense(units = 8, activation = "relu"),
    Dense(units = len(models_to_select), activation = "softmax"),
])

In [None]:
transfer_model.summary()

In [None]:
transfer_model.compile(optimizer = "adam", loss = "sparse_categorical_crossentropy", metrics = ["accuracy"])

In [None]:
transfer_model.fit(vgg_dataset, epochs = 30, steps_per_epoch = 1, callbacks = [TensorBoard(log_dir = "logs_transfer_original")])

In [None]:
# Second transfer-learning variant: a 16-unit hidden layer on top of the same
# VGG19 feature extractor, trained with RMSprop at a learning rate of 0.01.
transfer_model = Sequential([
    vgg_transfer,
    Dense(units = 16, activation = "relu"),
    Dense(units = len(models_to_select), activation = "softmax"),
])

transfer_model.compile(
    optimizer = RMSprop(learning_rate = 0.01),
    loss = "sparse_categorical_crossentropy",
    metrics = ["accuracy"],
)

In [None]:
transfer_model.fit(vgg_dataset, epochs = 30, steps_per_epoch = 1, callbacks = [TensorBoard(log_dir = "logs_transfer_larger")])