Sources used
* https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator
* https://datascience.stackexchange.com/questions/65979/what-is-the-correct-way-to-call-keras-flow-from-directory-method
* https://stackoverflow.com/questions/42443936/keras-split-train-test-set-when-using-imagedatagenerator
* https://github.com/keras-team/keras/issues/5862#issuecomment-647559571
* https://keras.io/api/preprocessing/image/
* https://stackoverflow.com/questions/57092637/how-to-fit-keras-imagedatagenerator-for-large-data-sets-using-batches

In [1]:
import sys
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install pandas
!{sys.executable} -m pip install numpy

You should consider upgrading via the '/home/tkashif/.conda/envs/ti-feeds-bert/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/tkashif/.conda/envs/ti-feeds-bert/bin/python -m pip install --upgrade pip' command.[0m
You should consider upgrading via the '/home/tkashif/.conda/envs/ti-feeds-bert/bin/python -m pip install --upgrade pip' command.[0m


In [2]:
import os
import time
import shutil
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img, img_to_array, array_to_img

In [3]:
start = time.time()

In [4]:
# define constants
ORIGINAL_PARENT_DIR = './FRUIT-16K' # enter path to original dataset
PREPROCESSED_PARENT_DIR = './FRUIT-16K-PREPROCESSED' # enter path to folder where you want the preprocessed data to go (code will create it)

CLASSES = []
for directory in os.listdir(ORIGINAL_PARENT_DIR):
    path = os.path.join(ORIGINAL_PARENT_DIR, directory)
    if os.path.isdir(path) and not directory.startswith('.'):
        CLASSES.append(directory)
print(CLASSES)

# define hyper-parameters
BATCH_SIZE = 32
IMAGE_HEIGHT = 224
IMAGE_WIDTH = 224
SEED = 42
TEST_SIZE = 0.2

['F_Tomato', 'S_Strawberry', 'F_Mango', 'F_Orange', 'S_Banana', 'F_Tamarillo', 'S_Orange', 'S_Tomato', 'S_Lemon', 'S_Tamarillo', 'S_Lulo', 'F_Lemon', 'F_Lulo', 'S_Mango', 'F_Strawberry', 'F_Banana']


In [5]:
def read_image(path):
    img = load_img(path, target_size = (IMAGE_HEIGHT, IMAGE_WIDTH))
    img_arr = img_to_array(img)
    return img_arr

def get_images_df(path):
    # walk through the path and create a dataframe
    # that has one column for the path to the image
    # and the other with the class name
    images_data = []
    for root, dirs, files in os.walk(path):
        for file in files:
            if not file.startswith('.'):
                # add to dict [ image | class_name ]
                class_name = root.split(os.sep)[-1]
                image_path = os.path.join(root, file)
                images_data.append({
                    'image_path': image_path,
                    'class': class_name
                })
    df = pd.DataFrame(images_data)
    
    return df

def create_train_test_data(x_train, x_test, y_train, y_test, classes, save_path): 
    # create the train and test folders
    train_path = os.path.join(save_path, 'train')
    test_path = os.path.join(save_path, 'test')
    os.mkdir(train_path)
    os.mkdir(test_path)
        
    # for each class, make a subfolder
    for class_name in classes:
        os.mkdir(os.path.join(train_path, class_name))
        os.mkdir(os.path.join(test_path, class_name))
    
    # transfer files from dataframe image path to the appropriate
    # subfolder in save_path (sub folders based on class name)
    # also create x_train_final and x_test_final which will
    # store pixels of image
    x_train_final = []
    x_test_final = []
    for x_val, y_val in zip(x_train, y_train):
        new_path = os.path.join(train_path, y_val)
        shutil.copy(x_val, new_path)
        filename = x_val.split(os.sep)[-1]
        img_arr = read_image(os.path.join(new_path, filename))
        x_train_final.append(img_arr)
    for x_val, y_val in zip(x_test, y_test):
        new_path = os.path.join(test_path, y_val)
        shutil.copy(x_val, new_path)
        filename = x_val.split(os.sep)[-1]
        img_arr = read_image(os.path.join(new_path, filename))
        x_test_final.append(img_arr)
    
    return np.asarray(x_train_final), np.asarray(x_test_final), y_train, y_test
        
def transfer_data(original_dir, new_dir, classes, test_size):
    # get the images as a dataframe
    # this dataframe has two columns, one with
    # the path to the image in original_dir and one with
    # the class name
    original_df = get_images_df(original_dir)
    
    # split the df into train and test
    x, y = original_df['image_path'], original_df['class']
    x_train, x_test, y_train, y_test = train_test_split(x, y, random_state = 42, test_size = test_size)
    
    # create the new_dir folder
    if os.path.exists(new_dir):
        shutil.rmtree(new_dir)
    os.mkdir(new_dir)
    
    # transfer all the data from original_df to new_dir
    # with the appropriate formatting and get back the
    # split with the x arrays having the pixels associated
    # with the images and the y arrays having the label
    x_train, x_test, y_train, y_test = create_train_test_data(x_train, x_test, y_train, y_test, classes, save_path = new_dir)
    
    return x_train, x_test, y_train, y_test

In [6]:
x_train, x_test, y_train, y_test = transfer_data(ORIGINAL_PARENT_DIR, PREPROCESSED_PARENT_DIR, classes = CLASSES, test_size = TEST_SIZE)

In [7]:
# augment and preprocess the training data
train_image_data_generator = ImageDataGenerator(rescale = 1./255, rotation_range=30,
                                          zoom_range=0.15, width_shift_range=0.2,
                                          height_shift_range=0.2, shear_range=0.15,
                                          horizontal_flip=True, fill_mode="nearest")
print("done with train image generator")

# fit the train_image_data_generator to the train set
# only run this line of code if featurewise_center or featurewise_std_normalization or zca_whitening set to True
# train_image_data_generator.fit(x_train)
print("done with fit")

# do NOT augment the testing data
test_image_data_generator = ImageDataGenerator(rescale = 1./255)


# create the training generator and the testing generator from their respective folders
train_generator = train_image_data_generator.flow_from_directory(os.path.join(PREPROCESSED_PARENT_DIR, 'train'), 
                                                                 target_size = (IMAGE_HEIGHT, IMAGE_WIDTH), 
                                                                 classes = CLASSES, batch_size = BATCH_SIZE, 
                                                                 shuffle = True, seed = SEED)

#print(PREPROCESSED_PARENT_DIR)


test_generator = test_image_data_generator.flow_from_directory(os.path.join(PREPROCESSED_PARENT_DIR, 'test'), 
                                                               target_size = (IMAGE_HEIGHT, IMAGE_WIDTH), 
                                                               classes = CLASSES, batch_size = BATCH_SIZE, 
                                                               shuffle = True, seed = SEED)

print("done with flow")

done with train image generator
done with fit
Found 12800 images belonging to 16 classes.
Found 3200 images belonging to 16 classes.
done with flow


In [8]:
end = time.time()

In [9]:
print(end - start)

84.63485050201416


A few potentially useful links/notes for the ML team
* https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/image/ImageDataGenerator (check out the Example of using `.flow_from_directory(directory)` section) for how (I think) you can use the fit function with generators)
* You can access the mapping from class to indices through `train_generator.class_indices` or `train_generator.class_indices`
* https://stackoverflow.com/questions/61864244/how-to-avoid-augmenting-data-in-validation-split-of-keras-imagedatagenerator
* https://www.pyimagesearch.com/2018/12/24/how-to-use-keras-fit-and-fit_generator-a-hands-on-tutorial/
* https://towardsdatascience.com/keras-data-generators-and-how-to-use-them-b69129ed779c

In [None]:
# verify that the class index mapping are the same (should always be the case)
print(test_generator.class_indices == train_generator.class_indices)
print(train_generator.class_indices)
print(len(train_generator.class_indices))

In [None]:
for _ in range(10):
    img, label = test_generator.next()
    print(label[0])
    plt.imshow(array_to_img(img[0]))
    plt.show()

In [None]:
for _ in range(10):
    img, label = train_generator.next()
    print(img.shape)
    print(label[0])
    plt.imshow(array_to_img(img[0]))
    plt.show()

In [None]:
from tensorflow.python.client import device_lib

device_lib.list_local_devices()

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten
from keras.layers import Conv2D, MaxPooling2D
import tensorflow as tf

model = Sequential()
model.add(Conv2D(32, kernel_size=(3, 3),
                 activation='relu',
                 input_shape=(224,224,3)))
model.add(Conv2D(64, (3, 3), activation='relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))
model.add(Dropout(0.20))
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.40))
model.add(Dense(len(train_generator.class_indices), activation='softmax'))

print(len(train_generator.class_indices))

model.compile(loss=keras.losses.categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adam(),
              metrics=['accuracy'])

callback = tf.keras.callbacks.EarlyStopping(monitor='loss', patience=5)

model.fit(train_generator,
          batch_size=32,
          epochs=750,
          verbose=1,
          validation_data=test_generator,
          callbacks=[callback])

score = model.evaluate(test_generator, verbose=0)
print('Test loss:', score[0])
print('Test accuracy:', score[1])

In [None]:
import numpy as np

img, label = train_generator.next()
print(label[0])
print(img[0].shape)

singleImage = img[0]
singleImage = np.expand_dims(singleImage, axis = 0) # Note that this is done because the model is trained on batches of images. 
print(singleImage.shape) # But when we are passing only 1 image our tensor size is only (224,224,3).
test = model.predict(singleImage) # What we really want to pass in is (BATCH_SIZE,224,224,3), in this case the batch is 1. 

test.shape
print(test)

np.argmax(test) # This is done to get the array index of the highest probability output since we are using softmax. 

plt.imshow(img[0])
plt.show()

print("Prediction: ",(CLASSES[np.argmax(test)]))