## Requirements

A [pip requirements file](https://pip.pypa.io/en/stable/user_guide/#requirements-files) can be found at: [/sashimdig/requirements.txt](../requirements.txt)

Notable requirements

|package    |version |
|----       |-----   |
|tensorflow | 0.10.0 |
| tflearn   | 0.2.1  |


----
### [TFLearn installation instructions](http://tflearn.org/installation/)
`tflearn` 0.2.1 requires the older TensorFlow 0.10 release (NOT the latest 1.0). Pick the wheel that matches your platform, then install it:

```
# Mac OS X, CPU only, Python 3.4 or 3.5:
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/cpu/tensorflow-0.10.0-py3-none-any.whl

# Mac OS X, GPU enabled, Python 3.4 or 3.5:
$ export TF_BINARY_URL=https://storage.googleapis.com/tensorflow/mac/gpu/tensorflow-0.10.0-py3-none-any.whl

# Then, for either option, install the selected wheel:
$ sudo -H pip3 install --upgrade $TF_BINARY_URL --ignore-installed
```

In [1]:
%matplotlib inline
import numpy             as np
import pandas            as pd
import matplotlib
import seaborn           as sns
import matplotlib.pyplot as plt
import os
from   os      import getcwd
from   os      import listdir
from   os.path import isfile, join, isdir

import skimage
from   skimage import measure
from   skimage import io

from   PIL     import Image

from   sklearn.model_selection    import train_test_split
from   sklearn.metrics            import log_loss
from   sklearn.preprocessing      import LabelEncoder
from   skimage.transform          import resize

import tensorflow as tf
import tflearn
from   tflearn.data_utils         import shuffle
from   tflearn.layers.core        import input_data, dropout, fully_connected
from   tflearn.layers.conv        import conv_2d, max_pool_2d
from   tflearn.layers.estimator   import regression
from   tflearn.data_preprocessing import ImagePreprocessing
from   tflearn.data_augmentation  import ImageAugmentation

In [2]:
def get_paths(foldNames):
    """Map each class folder name to the full paths of the files it contains.

    For every name in `foldNames`, the directory `trainPath/<name>` is listed
    and every entry that is a regular file is kept (sub-directories are
    skipped).

    Parameters
    ----------
    foldNames : list of str
        Names of sub-directories located directly under the global `trainPath`.

    Returns
    -------
    dict
        `{folder name: [join(trainPath, folder name, file name), ...]}`, with
        the files in the order returned by `listdir`.
    """
    paths = {}

    for folder in foldNames:
        folder_dir = join(trainPath, folder)
        paths[folder] = [join(trainPath, folder, name)
                         for name in listdir(folder_dir)
                         if isfile(join(trainPath, folder, name))]

    return paths

def read_image(src):
    """Read one image from disk and resize it to (ROWS, COLS) as uint8.

    `skimage.transform.resize` returns floating-point data scaled to [0, 1].
    The callers in this notebook store the result in `np.uint8` arrays, where
    such values would be truncated to 0, so the resized image is converted
    back to 8-bit (0-255) before it is returned.

    Parameters
    ----------
    src : str
        Path of the image file to load.

    Returns
    -------
    numpy.ndarray
        uint8 image whose first two dimensions are (ROWS, COLS).
    """
    im = io.imread(src)
    im = resize(im, (ROWS, COLS))
    # Rescale the [0, 1] floats to 0-255 integers so uint8 storage keeps the pixel data.
    return skimage.img_as_ubyte(im)


# Setup

In [3]:
%%time

# Target image size after resizing, plus the number of colour channels and classes.
# NOTE(review): ROWS = 2 squeezes every image down to two pixel rows; the trailing
# values look like the intended alternatives (60x80, 90x160, 720x1280) -- confirm
# before a real training run.
ROWS           = 2 #60   90 720
COLS           = 80 #80    160 1280
CHANNELS       = 3
NUM_CATEGORIES = 8

trainPath      = '../data/raw/train'
testPath       = '../data/raw/test_stg1'
submitPath     = '../data/raw'
# One sub-directory per fish class. listdir() returns entries in arbitrary order, so
# sort them: LabelEncoder numbers the classes alphabetically, and the prediction
# columns are later labelled with fish_classes, so both orders must agree.
fish_classes   = sorted(f for f in listdir(trainPath) if isdir(join(trainPath, f)))
groupData      = pd.DataFrame ({'group': fish_classes})
fish_paths     = get_paths(fish_classes)

CPU times: user 41.7 ms, sys: 49.3 ms, total: 90.9 ms
Wall time: 144 ms


# Build x and y arrays

In [None]:
%%time

# Record how many image files each class contains. Assigning the whole column at
# once keeps it integer-typed and avoids the `.ix` indexer, which newer pandas
# versions no longer provide.
groupData['num files'] = [len(fish_paths[fish]) for fish in fish_classes]

# Flatten the per-class path lists into one list of files and build the matching
# list of class names (one label per file, in the same order).
files = []
Y_cat = []

for fish in fish_classes:
    fish_files = fish_paths[fish]
    files.extend(fish_files)
    Y_cat.extend([fish] * len(fish_files))

Y_cat = np.array(Y_cat)
print(Y_cat)

['ALB' 'ALB' 'ALB' ..., 'YFT' 'YFT' 'YFT']
CPU times: user 7.24 ms, sys: 759 µs, total: 8 ms
Wall time: 7.63 ms


In [None]:
%%time

# Pre-allocate one (ROWS, COLS, CHANNELS) uint8 slot per training file, then fill
# the slots file by file. Assigning into X_all casts each image to uint8.
n_files = len(files)
X_all = np.ndarray(shape=(n_files, ROWS, COLS, CHANNELS), dtype=np.uint8)

for i, f in enumerate(files):
    im = read_image(f)
    X_all[i] = im
    # Progress report every 1000 images
    if i % 1000 == 0:
        print('Processed {} of {}'.format(i, n_files))


Processed 0 of 3777


# View resampled image

In [None]:
# Show the first resized training image exactly as it is stored in X_all
# (previously the leftover loop variable `im`, i.e. the last file read, was drawn).
image = X_all[0]
plt.figure(figsize=(5, 5))
plt.imshow(image, cmap='gray', interpolation='nearest')
plt.axis('off')
plt.tight_layout()
plt.show()

# Training data
* One hot encoding labels
* Split data


In [None]:
# Encode the class names as consecutive integers (classes are numbered in
# alphabetical order by LabelEncoder).
Y_labels = LabelEncoder().fit_transform(Y_cat)

# One Hot Encoding Labels
#    Transform the integer label array into a matrix of the same height,
#    but with a boolean column for each category.
Y_all = tflearn.data_utils.to_categorical(Y_labels, NUM_CATEGORIES)

# test_size: between 0 and 1, the proportion of the dataset held out for validation.
# random_state: seed of the pseudo-random generator used for sampling; any fixed
#               integer makes the split reproducible.
# stratify: ensures that the split datasets are balanced, i.e. contain the same
#           percentage of each class. The 1-D integer labels are used here rather
#           than the one-hot matrix, since stratification expects one class label
#           per sample.
X_train, X_valid, Y_train, Y_valid = train_test_split(X_all, Y_all,
                                                    test_size=0.2, random_state=23,
                                                    stratify=Y_labels)

## Test Data

In [None]:
%%time

# Read in the test photo set. Keep regular files only (as get_paths does for the
# training folders) and sort the names so the row order is reproducible.
test_files = sorted(name for name in os.listdir(testPath)
                    if isfile(join(testPath, name)))
test       = np.ndarray((len(test_files), ROWS, COLS, CHANNELS), dtype=np.uint8)
for i, test_file in enumerate(test_files):
    test[i] = read_image(join(testPath, test_file))

# TFLEARN

## Define the model


In [None]:
def dnn_test1():
    """Build a small convolutional classifier and wrap it in a `tflearn.DNN`.

    The network is assembled inside a fresh `tf.Graph` so that this function
    can be called several times in one kernel session: rebuilding the network
    without resetting the graph produces errors (the alternative would be to
    restart the kernel each time).

    Layer sequence
    --------------
    input (ROWS x COLS x CHANNELS)
    -> conv 32 (size 3, relu, L2) -> max-pool 2
    -> conv 64 (size 3, relu, L2) -> conv 64 (size 3, relu, L2) -> max-pool 2
    -> fully connected 512 (relu, L2) -> dropout(0.5)
    -> fully connected NUM_CATEGORIES (softmax, L2)
    -> regression with categorical cross-entropy, learning rate 0.01

    Returns
    -------
    tflearn.DNN
        Model object wrapping the network, with `tensorboard_verbose=0`.
    """
    with tf.Graph().as_default():
        # Currently disabled option: real-time preprocessing. An ImagePreprocessing()
        # with add_featurewise_zero_center() could be handed to input_data via
        # data_preprocessing=...

        # input layer
        net = input_data(shape=[None, ROWS, COLS, CHANNELS])

        # hidden layers: two convolution stages, each closed by 2x max-pooling
        net = conv_2d(net, 32, 3, activation='relu', regularizer='L2')
        net = max_pool_2d(net, 2)
        net = conv_2d(net, 64, 3, activation='relu', regularizer='L2')
        net = conv_2d(net, 64, 3, activation='relu', regularizer='L2')
        net = max_pool_2d(net, 2)

        # dense stage followed by dropout
        net = fully_connected(net, 512, activation='relu', regularizer='L2')
        net = dropout(net, 0.5)

        # output layer: one softmax unit per category
        net = fully_connected(net, NUM_CATEGORIES, activation='softmax', regularizer='L2')

        # training op
        net = regression(net,
                         loss='categorical_crossentropy',
                         learning_rate=0.01)

        # The DNN wrapper is created while the fresh graph is still the default one.
        dnn = tflearn.DNN(net, tensorboard_verbose=0)
    return dnn

# Define model: build the network once and keep the resulting tflearn.DNN;
# the cells below call model.fit(...) and model.predict(...) on it.
model = dnn_test1()

In [None]:
%%time

# Start training (apply the gradient descent algorithm). Several epochs are
# normally wanted unless this is just a quick test.
N_EPOCH    = 50   # number of epochs to run
BATCH_SIZE = 96   # samples per training batch

# Train the classifier, reporting metrics against the held-out validation split
model.fit(X_train, Y_train,
          validation_set = (X_valid, Y_valid),
          n_epoch        = N_EPOCH,
          batch_size     = BATCH_SIZE,
          shuffle        = True,
          show_metric    = True)

## Predict & save to submission file

In [None]:
%%time

# Model predict: run the trained model on the test images. The result is turned
# into the submission table (one column per fish class) in the next cell.
test_preds1 = model.predict(test)

In [None]:
%%time

# The model's output columns follow the class indices assigned by LabelEncoder, which
# numbers the class names in alphabetical order. Label the columns with the sorted
# class names so they match regardless of the order the folders were listed in.
submission = pd.DataFrame(test_preds1, columns=sorted(fish_classes))
# First column: the test file name each prediction row belongs to
submission.insert(0, 'image', test_files)
submission.to_csv(join(submitPath,'jfa-2.0-submission.csv'),
                 index=False)
submission.head()

In [None]:
def sample_prediction(jImage):
    """Display one test image and print its predicted class scores.

    Parameters
    ----------
    jImage : int
        Row of the global `submission` table to inspect.

    The image named in that row is re-read from `testPath` (resized by
    `read_image`) and drawn without axes; the remaining columns of the row are
    then printed from highest to lowest value.
    """
    image_name = submission.image[jImage]
    picture = read_image(join(testPath, image_name))

    plt.figure(figsize=(5, 5))
    plt.imshow(picture, cmap='gray', interpolation='nearest')
    plt.axis('off')
    plt.tight_layout()
    plt.show()

    # Every column after 'image' holds the score of one class
    class_scores = submission.iloc[jImage, 1:]
    ranked = class_scores.sort_values(ascending=False)
    print(ranked)


In [None]:
# Show predictions for a random sample of test images. The upper bound comes from
# the submission table itself, so every row can be drawn and no index falls
# outside the table (it was hard-coded to 1000 before).
num_samples = 20
for i in range(num_samples):
    sample_prediction(np.random.randint(low=0, high=len(submission)))

In [None]:
# def show_images(images,titles=None):
#     """Display a list of images"""
#     n_ims = len(images)
#     if titles is None: titles = ['(%d)' % i for i in range(1,n_ims + 1)]
#     fig = plt.figure()
#     n = 1
#     for image,title in zip(images,titles):
#         a = fig.add_subplot(1,n_ims,n) # Make subplot
#         if image.ndim == 2: # Is image grayscale?
#             plt.gray() # Only place in this blog you can't replace 'gray' with 'grey'
#         plt.imshow(image)
#         a.set_title(title)
#         print(submission.iloc[jImage,:])

#         n += 1
#     fig.set_size_inches(np.array(fig.get_size_inches()) * n_ims)
#     plt.show()