In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import os
from glob import glob
from copy import copy
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import ImageDataGenerator

In [None]:
# Seed NumPy's global random number generator once, up front, so that every
# later draw from it is repeatable. The same value is reused as `random_state`
# for the data splits further down.
random_seed = 0
np.random.seed(seed=random_seed)

In [None]:
!ls -lh ../input

In [None]:
# Read the tabular training metadata and show the names of its columns.
all_data = pd.read_csv('../input/train/train.csv')
all_data.columns.tolist()

In [None]:
# Peek at the first two rows of the metadata table.
all_data.head(n=2)

In [None]:
# Build `image_data`: one row per image file, carrying the pet's tabular
# attributes plus the image's full path and bare file name.
pet_data = all_data.set_index('PetID')
image_data = []

# sorted() gives a stable row order. glob() alone returns files in an
# arbitrary, filesystem-dependent order, which would make the seeded
# train/validation/test split differ from one machine to the next.
for image in sorted(glob('../input/train_images/*.jpg')):
    basename = os.path.basename(image)
    # The part before the dash is used as the PetID. Split only on the LAST
    # dash (maxsplit=1) so that a PetID which itself contains '-' cannot
    # break the two-value unpacking.
    pet_id, _ = basename.rsplit('-', 1)
    # .loc looks the pet up by index label (PetID); .to_dict() turns that
    # row into a plain {column: value} dict we can extend.
    pet_row = pet_data.loc[pet_id].to_dict()
    pet_row['ImageFilename'] = image
    pet_row['ImageBasename'] = basename
    image_data.append(pet_row)

# Assemble the frame once from the list of records (cheaper than growing a
# DataFrame row by row).
image_data = pd.DataFrame(image_data)
# Store the target as strings; it is used as a categorical class label by the
# image generators further down.
image_data['AdoptionSpeed'] = image_data['AdoptionSpeed'].astype(str)
image_data.head(2)

In [None]:
# Number of image rows versus number of pets in the metadata table.
image_data.shape[0], pet_data.shape[0]

In [None]:
# Split the image rows into train / validation / test sets, stratified on the
# AdoptionSpeed label so every split keeps the same class proportions.
y = image_data['AdoptionSpeed']
test_size = 0.2
validation_size = 0.2

# First cut: hold out the COMBINED validation + test share. (Holding out only
# `test_size` here would leave validation and test with half their intended
# size, because the leftover is divided again below.)
# NOTE(review): rows are images, not pets, so images of the same pet can land
# in different splits.
X_train, X_leftover, y_train, y_leftover = train_test_split(
    image_data, y,
    test_size=test_size + validation_size,
    random_state=random_seed,  # seed for a repeatable shuffle
    stratify=y.values,
)

# Fraction of the leftover rows that goes to the test set; the rest becomes
# the validation set.
test_split = test_size / (test_size + validation_size)

X_validate, X_test, y_validate, y_test = train_test_split(
    X_leftover, y_leftover,
    test_size=test_split,
    random_state=random_seed,
    stratify=y_leftover.values,
)

# Printed explicitly: a bare expression is only displayed when it is the last
# statement of the cell.
print(X_train.shape, X_validate.shape, X_test.shape)

# Take a sample path by POSITION. After the shuffle X_train keeps the original
# index labels, so a fixed label such as 10 is not guaranteed to be present.
val = X_train['ImageFilename'].iloc[0]
print(X_train['ImageFilename'].head(10))
print(val)

In [None]:
# Collect the file names found in the training image folder and show the
# first ten of them.
train_fnames = os.listdir('../input/train_images/')
print(train_fnames[0:10])

In [None]:
# Generator for training images: random shear, random zoom and random
# left/right mirroring.
train_datagen = ImageDataGenerator(
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# Generator for validation images: no transformation configured.
val_datagen = ImageDataGenerator()

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

# Layout of the preview grid: 2 rows by 4 columns, i.e. 8 images per page.
nrows, ncols = 2, 4

# Running offset into the list of image files being previewed.
pic_index = 0

In [None]:
# Show the next page of training images in an nrows x ncols grid.
# NOTE: this cell advances `pic_index`, so every re-run shows the next page.

# Derive the page size from the grid so the two cannot drift apart.
page_size = nrows * ncols

# Size the current figure so each picture gets a 4x4 inch tile.
fig = plt.gcf()
fig.set_size_inches(ncols * 4, nrows * 4)

pic_index += page_size
next_pix = [os.path.join('../input/train_images/', fname)
            for fname in train_fnames[pic_index - page_size:pic_index]]

for i, img_path in enumerate(next_pix):
    # Subplot positions are 1-based.
    sp = plt.subplot(nrows, ncols, i + 1)
    sp.axis('off')  # hide axes and gridlines
    sp.imshow(mpimg.imread(img_path))

plt.show()

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Augmentation recipe used for the visual demo below: random rotation,
# horizontal/vertical shifts, shear, zoom and mirroring. Pixels uncovered by a
# transform are filled using the 'nearest' mode.
datagen = ImageDataGenerator(
    horizontal_flip=True,
    rotation_range=40,
    shear_range=0.2,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    fill_mode='nearest',
)

In [None]:
# First ten image paths of the training split.
print(X_train['ImageFilename'].head(10))

In [None]:
%matplotlib inline

import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from tensorflow.keras.preprocessing.image import array_to_img, img_to_array, load_img

# Visualise the augmentation: load one training image and draw a handful of
# randomly transformed versions of it.

# Select the image by POSITION. X_train keeps the original index labels after
# the shuffle, so a hard-coded label is not guaranteed to be in this split.
img_path = X_train['ImageFilename'].iloc[0]
img = load_img(img_path, target_size=(150, 150))  # this is a PIL image
x = img_to_array(img)  # Numpy array with shape (150, 150, 3)
x = x.reshape((1,) + x.shape)  # Numpy array with shape (1, 150, 150, 3)

# .flow() yields randomly transformed batches forever, so bound the loop.
# range() comes first in zip() so that no extra batch is generated once the
# limit is reached.
n_examples = 5
for i, batch in zip(range(n_examples), datagen.flow(x, batch_size=1)):
    plt.figure(i)
    plt.imshow(array_to_img(batch[0]))

In [None]:
# Number of images per batch; used by the data generators and to derive the
# number of steps per epoch when training.
BATCH_SIZE = 64

In [None]:
# Training images: scale pixel values to [0, 1] and apply random rotation,
# shifts, shear, zoom and horizontal flips.
train_datagen = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)

# Validation images are only rescaled - they must not be augmented.
test_datagen = ImageDataGenerator(rescale=1./255)

# Settings shared by both generators, kept in one place so they cannot drift.
flow_options = dict(
    directory='../input/train_images/',
    x_col='ImageBasename',
    y_col='AdoptionSpeed',
    target_size=(150, 150),
    class_mode='categorical',
    batch_size=BATCH_SIZE,
)

# Flow training images in batches of BATCH_SIZE (shuffled, the default).
train_generator = train_datagen.flow_from_dataframe(
    X_train.reset_index(),  # Need to reset index due to bug in flow_from_dataframe
    **flow_options
)

# Flow validation images in batches of BATCH_SIZE. shuffle=False keeps the
# order fixed, so every epoch is evaluated on the same images even when
# validation_steps does not cover the last partial batch.
val_generator = test_datagen.flow_from_dataframe(
    X_validate.reset_index(),  # Need to reset index due to bug in flow_from_dataframe
    shuffle=False,
    **flow_options
)
        

In [None]:
from tensorflow.keras import layers
from tensorflow.keras import Model
from tensorflow.keras.optimizers import RMSprop

# Small CNN classifier: three {convolution + ReLU + max-pooling} modules,
# followed by a dense head.
# Convolutions use 3x3 windows and extract 16, 32 and 64 filters respectively;
# each is followed by max-pooling over a 2x2 window. Keeping the network this
# small limits the number of parameters, which lowers the risk of overfitting.

# The input feature map is 150x150x3: 150x150 image pixels and 3 colour
# channels (R, G, B).
img_input = layers.Input(shape=(150, 150, 3))

# First module: 16 filters of size 3x3, then 2x2 max-pooling
x = layers.Conv2D(16, 3, activation='relu')(img_input)
x = layers.MaxPooling2D(2)(x)

# Second module: 32 filters of size 3x3, then 2x2 max-pooling
x = layers.Conv2D(32, 3, activation='relu')(x)
x = layers.MaxPooling2D(2)(x)

# Third module: 64 filters of size 3x3, then 2x2 max-pooling
x = layers.Conv2D(64, 3, activation='relu')(x)
x = layers.MaxPooling2D(2)(x)

# Flatten feature map to a 1-dim tensor
x = layers.Flatten()(x)

# Fully connected layer with ReLU activation and 512 hidden units
x = layers.Dense(512, activation='relu')(x)

# Dropout with rate 0.5 for regularisation
x = layers.Dropout(0.5)(x)

# Output layer: one unit per class (5 units). Softmax turns the outputs into a
# probability distribution, which is what categorical cross-entropy expects;
# a ReLU output is unbounded and can be all zeros.
output = layers.Dense(5, activation='softmax')(x)

# Configure and compile the model
# NOTE(review): newer TensorFlow releases name this argument `learning_rate`;
# switch when upgrading.
model = Model(img_input, output)
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(lr=0.0001),
              metrics=['accuracy'])


In [None]:
# Print the layer-by-layer overview of the compiled model.
model.summary()

In [None]:
# Train for 10 epochs. One epoch is as many full batches as fit into the
# training split; validation likewise uses only full batches.
# TODO: Add class weight
history = model.fit_generator(
    train_generator,
    steps_per_epoch=len(X_train) // BATCH_SIZE,
    epochs=10,
    validation_data=val_generator,
    validation_steps=len(X_validate) // BATCH_SIZE,
    shuffle=True,
    workers=4,
    use_multiprocessing=True,
)

In [None]:
# Plot the per-epoch training and validation curves recorded by Keras.
history_dict = history.history

# The accuracy metric is recorded as 'acc' by some Keras versions and as
# 'accuracy' by others; accept either instead of failing with a KeyError.
acc_key = 'acc' if 'acc' in history_dict else 'accuracy'

# Accuracy on the training and validation sets for each epoch
acc = history_dict[acc_key]
val_acc = history_dict['val_' + acc_key]

# Loss on the training and validation sets for each epoch
loss = history_dict['loss']
val_loss = history_dict['val_loss']

# Get number of epochs
epochs = range(len(acc))

# Plot training and validation accuracy per epoch
plt.plot(epochs, acc, label='training')
plt.plot(epochs, val_acc, label='validation')
plt.title('Training and validation accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend()

plt.figure()

# Plot training and validation loss per epoch
plt.plot(epochs, loss, label='training')
plt.plot(epochs, val_loss, label='validation')
plt.title('Training and validation loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()

plt.show()

In [None]:
# As you can see, we are overfitting like it's getting out of fashion. Our training accuracy
# (in blue) gets close to 100% (!) while our validation accuracy (in green) stalls at 70%. 
# Our validation loss reaches its minimum after only five epochs.
# Since we have a relatively small number of training examples (2000), overfitting should
# be our number one concern. Overfitting happens when a model exposed to too few examples 
# learns patterns that do not generalize to new data, i.e. when the model starts using 
# irrelevant features for making predictions. For instance, if you, as a human, only see 
# three images of people who are lumberjacks, and three images of people who are sailors, 
# and among them the only person wearing a cap is a lumberjack, you might start thinking 
# that wearing a cap is a sign of being a lumberjack as opposed to a sailor. You would 
# then make a pretty lousy lumberjack/sailor classifier.

# Overfitting is the central problem in machine learning: given that we are fitting the 
# parameters of our model to a given dataset, how can we make sure that the representations
# learned by the model will be applicable to data never seen before? How do we avoid 
# learning things that are specific to the training data?