# Below is the full code for part 1 of classifying plankton using deep learning.

### Training Data Exploration

In [1]:
# Show the notebook's working directory; the next cell builds data_dir from os.getcwd().
%pwd

In [2]:
import os, sys
# Anchor every path on the directory the notebook was started from.
nb_dir = current_dir = os.getcwd()
# The trailing slash is deliberate: later cells append sub-directory names to
# data_dir by plain string concatenation.
data_dir = current_dir + '/data/plankton/'

In [3]:
#To carry out matrix manipulations.
import numpy as np
# To plot images and visualize data.
from matplotlib import pyplot as plt
# To access files that reside in similar paths effortlessly.
from glob import glob
# Importing the Python Imaging Library.
import PIL
# A PIL module that allows to open, rotate and display images.
from PIL import Image
# A PIL module that allows for performing image processing operations.
from PIL import ImageOps
# Importing imread to read an image from disk and imshow to display it.
from skimage.io import imread, imshow
# importing color mapping from python.
from pylab import cm
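# Importing the glob module itself. This rebinds the name `glob` from the function
# imported above to the module, which is why later cells call glob.glob(...).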
import glob
# This allows us to view matplot lib plots in the jupyter notebook
%matplotlib inline

In [4]:
# Switch into the training directory: later cells use paths relative to it
# (for example the '*/*.jpg' glob and the sources of the os.rename calls).
%cd $data_dir/train

In [5]:
# Every sub-directory of train/ is treated as one category. Keep only real
# directories (a stray file such as .DS_Store would break the later
# per-folder cells) and sort them, because os.listdir returns entries in
# arbitrary order.
directory_names = sorted(
    name for name in os.listdir(data_dir+"train")
    if os.path.isdir(os.path.join(data_dir+"train", name))
)

In [6]:
# Here are the first five directory names, it may vary for you...
# (slice from the start; [1:6] would silently skip the first folder)
directory_names[:5]

In [7]:
# The number of categories we are supposed to classify...
# .format() has to be applied to the string itself; applied to print(...) it
# only works with the Python 2 print statement and fails on Python 3.
print("There are {} folders.".format(len(directory_names)))

In [8]:
# Count the .jpg files in all the training class folders combined.
train_root = data_dir+"train"
numberofImages = 0
for folder in directory_names:
    # os.walk also descends into any nested sub-folders of the class folder.
    for dirpath, dirnames, filenames in os.walk(os.path.join(train_root, folder)):
        # Only count the images
        numberofImages += sum(1 for fileName in filenames if fileName.endswith(".jpg"))

# You should see "There are a total of 30336 training images" upon running this bit of code.
# .format() is applied to the string, not to the result of print().
print("There are a total of {} training images".format(numberofImages))

In [9]:
# A very important thing to note: upon running this block of code you will find
# that the image sizes vary from picture to picture.
# Slicing (instead of indexing 0..4) copes with fewer than five class folders.
for folder in directory_names[:5]:
    class_images = glob.glob(os.path.join(data_dir+"train", folder, '*.jpg'))
    if not class_images:
        # Nothing to preview for an empty class folder.
        continue
    # Show the 9th match as before, falling back to the last one when the
    # folder holds fewer than nine images.
    example_file = class_images[min(8, len(class_images) - 1)]
    im = imread(example_file)
    plt.imshow(im, cmap=cm.gray)
    print(example_file)
    plt.show()

### Validation Data Creation

In [10]:
# The next step is to separate out validation data and plot it...
# Create valid/ next to train/ with the same class sub-folders as the training
# data. Absolute paths make the cell independent of the current working
# directory, and the existence checks make it safe to re-run.
valid_root = data_dir+'valid'
if not os.path.isdir(valid_root):
    os.makedirs(valid_root)

for d in directory_names:
    class_dir = os.path.join(valid_root, d)
    if not os.path.isdir(class_dir):
        os.mkdir(class_dir)

In [11]:
# Collect every image one folder below the working directory as a relative
# "<folder>/<file>.jpg" path (the working directory is train/ after the
# earlier %cd cell).
g = glob.glob('*/*.jpg')
# Random order of all the image paths; the validation split is taken from it.
shuf = np.random.permutation(g)
# Preview the first image of the shuffled order.
first_shuffled = shuf[0]
im = imread(first_shuffled)
plt.imshow(im, cmap=cm.gray)
print(first_shuffled)
plt.show()

In [12]:
# Hold out one fifth of the shuffled images as the validation set. For the
# 30336 training images this is 6067, the figure that used to be hard-coded;
# deriving it from len(shuf) avoids an IndexError on a smaller dataset.
num_valid = len(shuf) // 5
for image_path in shuf[:num_valid]:
    # image_path is "<class folder>/<file>.jpg" relative to train/, so the
    # same suffix places the file in the matching folder under valid/.
    os.rename(image_path, data_dir+'valid/'+image_path)
print("Done moving images to the validation set...")

In [13]:
# Count the .jpg files now sitting in the validation class folders.
numberofImages = 0
for folder in directory_names:
    class_root = os.path.join(data_dir+"valid", folder)
    # os.walk yields (path, sub-folders, files) for the folder and anything nested in it.
    for walked_path, walked_dirs, files_here in os.walk(class_root):
        numberofImages += len([f for f in files_here if f.endswith(".jpg")])

print(numberofImages)

### Visualizing Training Data

In [14]:
# Preview one image from each of the first five class folders that remain in
# train/. Slicing copes with fewer than five class folders.
for folder in directory_names[:5]:
    class_images = glob.glob(os.path.join(data_dir+"train", folder, '*.jpg'))
    if not class_images:
        # Nothing to preview for an empty class folder.
        continue
    # Show the 9th match as before, falling back to the last one when the
    # folder holds fewer than nine images.
    example_file = class_images[min(8, len(class_images) - 1)]
    im = imread(example_file)
    plt.imshow(im, cmap=cm.gray)
    print(example_file)
    plt.show()

### Resizing all training, validation and test images so they are uniform.

In [15]:
def resize_save_image(image_path):
    """Pad the image at ``image_path`` to a square and overwrite the file.

    The image is read as 8-bit greyscale, white (pixel value 255) borders are
    added along its shorter dimension so the content stays centred, and the
    result is written back to ``image_path``. The side of the square equals
    the longer side of the original image.

    Args:
        image_path: Path of the image file to rewrite in place.

    Returns:
        None. The file at ``image_path`` is replaced.
    """
    # Read as single-channel 'L' so the array is always 2-D, and close the
    # file handle before the same path is written again below.
    with Image.open(image_path) as source:
        image_array = np.asarray(source.convert('L'))

    height, width = image_array.shape

    # Split the missing pixels between both sides. An odd difference puts the
    # extra pixel on the trailing side, so the result is exactly square and
    # no interpolating resize is needed afterwards.
    missing = abs(height - width)
    before = missing // 2
    after = missing - before

    if height > width:
        # Taller than wide: add columns on the left and right.
        pad_widths = ((0, 0), (before, after))
    else:
        # Wider than tall: add rows on top and bottom. For a square image
        # `missing` is 0 and this pads nothing.
        pad_widths = ((before, after), (0, 0))

    # Padding the array with white color (px value 255).
    padded_array = np.pad(image_array, pad_widths, mode='constant', constant_values=255)

    # The uint8 2-D array is converted back to a greyscale image.
    img = Image.fromarray(padded_array)

    # Image is saved to the same path where it was retrieved from.
    img.save(image_path)

In [16]:
# We traverse all the class folders and resize the training images in place.
# Absolute glob patterns are used instead of os.chdir, so the notebook's
# working directory (which earlier relative-path cells rely on) is left untouched.
for folder in directory_names:
    for image_path in glob.glob(os.path.join(data_dir+'train', folder, '*.jpg')):
        resize_save_image(image_path)

In [17]:
# Some resized training images...
# Slicing copes with fewer than five class folders.
for folder in directory_names[:5]:
    class_images = glob.glob(os.path.join(data_dir+"train", folder, '*.jpg'))
    if not class_images:
        # Nothing to preview for an empty class folder.
        continue
    # Show the 9th match as before, falling back to the last one when the
    # folder holds fewer than nine images.
    example_file = class_images[min(8, len(class_images) - 1)]
    im = imread(example_file)
    plt.imshow(im, cmap=cm.gray)
    print(example_file)
    plt.show()

In [18]:
# Resizing the validation images in place.
# Absolute glob patterns are used instead of os.chdir, so the notebook's
# working directory is left untouched.
for folder in directory_names:
    for image_path in glob.glob(os.path.join(data_dir+'valid', folder, '*.jpg')):
        resize_save_image(image_path)

In [19]:
# Some resized validation images...
# Slicing copes with fewer than five class folders.
for folder in directory_names[:5]:
    class_images = glob.glob(os.path.join(data_dir+"valid", folder, '*.jpg'))
    if not class_images:
        # A class may have received no validation images at all.
        continue
    # Show the 3rd match as before, falling back to the last one when the
    # folder holds fewer than three images.
    example_file = class_images[min(2, len(class_images) - 1)]
    im = imread(example_file)
    plt.imshow(im, cmap=cm.gray)
    print(example_file)
    plt.show()

In [20]:
# Resize the test images in place.
# With more than 130,000 images this cell takes a while to run...
test_root = data_dir+'test/'
# The working directory is switched to test/ so that g holds bare file names;
# the next cell joins them back onto data_dir+"test/".
os.chdir(test_root)
g = glob.glob('*.jpg')
for test_image in g:
    resize_save_image(test_image)

In [21]:
# Visualize the test data: the first five file names collected in g.
# Slicing copes with a test folder that holds fewer than five images.
for test_name in g[:5]:
    example_file = os.path.join(data_dir+"test/", test_name)
    im = imread(example_file)
    plt.imshow(im, cmap=cm.gray)
    print(example_file)
    plt.show()

### Part two of my blogpost will be expanding on this notebook...