# In [47]:
# The code was tested on a local machine with Python 3.6.8.
# The model was trained on 1600 samples (800 dogs and 800 cats) and validated on 400 samples.
# It was then tested on 100 samples that I labelled manually and reached 78% accuracy.
# The accuracy should improve significantly after training on the whole dataset.

# Import libraries

import numpy as np
import pandas as pd 
from keras.preprocessing.image import ImageDataGenerator, load_img
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import random
import os
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Dropout, Flatten, Dense, Activation, BatchNormalization
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Configuration constants

# When True, the training run is shortened to a few epochs for a quick check.
FAST_RUN = False

# Every image is resized to this width and height before it reaches the network.
IMAGE_WIDTH = IMAGE_HEIGHT = 128
IMAGE_SIZE = (IMAGE_WIDTH, IMAGE_HEIGHT)  # handed to the data generators as target_size
IMAGE_CHANNELS = 3  # last dimension of the model's input shape

# Use this code for training on a part of the samples

# Prepare training data with 2000 samples: 50% dogs, 50% cats
#TRAIN_DIR = 'input/train/'
#TEST_DIR = 'input/test/'

#train_images = [TRAIN_DIR+i for i in os.listdir(TRAIN_DIR)] # use this for full dataset
#train_dogs =   [i for i in os.listdir(TRAIN_DIR) if 'dog' in i]
#train_cats =   [i for i in os.listdir(TRAIN_DIR) if 'cat' in i]

#test_images =  [i for i in os.listdir(TEST_DIR)]

#train_images = train_dogs[:1000] + train_cats[:1000]
#random.shuffle(train_images)

#categories = []
#for filename in train_images:
    #category = filename.split('.')[0]
    #if 'dog' in category :
        #categories.append(1)
    #else:
        #categories.append(0)

#df = pd.DataFrame({
    #'filename': train_images,
    #'category': categories
#})



# Prepare training data

# The label is taken from the text before the first '.' in each filename.
# Only the two known prefixes are accepted: anything else found in the
# directory (for example a hidden file such as ".DS_Store") is skipped rather
# than being silently labelled as a cat.
label_by_prefix = {'cat': 0, 'dog': 1}

# os.listdir() returns entries in arbitrary order, so the listing is sorted to
# keep the seeded train/validation split identical from run to run.
filenames = [
    name for name in sorted(os.listdir("input/train"))
    if name.split('.')[0] in label_by_prefix
]
categories = [label_by_prefix[name.split('.')[0]] for name in filenames]

# One row per training image: file name plus numeric label (1 = dog, 0 = cat).
df = pd.DataFrame({
    'filename': filenames,
    'category': categories
})

# Visualize training data

#df['category'].value_counts().plot.bar()

#sample = random.choice(filenames)
#image = load_img("input/train/" + sample)
#plt.imshow(image)

# Build model

model = Sequential()

# Three convolutional stages with 32, 64 and 128 filters. Each stage is
# convolution -> batch normalisation -> 2x2 max pooling -> 25% dropout.
for stage_index, filter_count in enumerate((32, 64, 128)):
    # Only the very first layer declares the input shape.
    extra_kwargs = {}
    if stage_index == 0:
        extra_kwargs['input_shape'] = (IMAGE_WIDTH, IMAGE_HEIGHT, IMAGE_CHANNELS)
    model.add(Conv2D(filter_count, (3, 3), activation='relu', **extra_kwargs))
    model.add(BatchNormalization())
    model.add(MaxPooling2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))

# Classifier head: flatten, one 512-unit hidden layer, then a 2-way softmax
# (2 outputs because there are two classes, cat and dog).
head_layers = (
    Flatten(),
    Dense(512, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(2, activation='softmax'),
)
for head_layer in head_layers:
    model.add(head_layer)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer summary of the compiled model.
model.summary()

# Make callbacks

# Halve the learning rate when 'val_acc' has not improved for 2 epochs,
# but never go below 1e-5.
# NOTE(review): the name 'val_acc' depends on the installed Keras version
# (newer releases report 'val_accuracy') - confirm against the training log.
plateau_options = dict(
    monitor='val_acc',
    factor=0.5,
    patience=2,
    min_lr=0.00001,
    verbose=1,
)
learning_rate_reduction = ReduceLROnPlateau(**plateau_options)

# Stop training after 10 epochs without improvement of the callback's
# default monitored quantity (no explicit monitor is passed here).
earlystop = EarlyStopping(patience=10)

callbacks = [earlystop, learning_rate_reduction]

# Prepare data

# Turn the numeric labels into class-name strings; this column is what the
# generators read as y_col.
df["category"] = df["category"].replace({0: 'cat', 1: 'dog'})

# Number of images per batch, shared by every generator and step computation.
batch_size = 15

# Hold out 20% of the rows for validation (fixed seed) and renumber both
# parts from zero.
train_df, validate_df = (
    part.reset_index(drop=True)
    for part in train_test_split(df, test_size=0.20, random_state=42)
)

# Row counts, used to derive the number of steps per epoch.
total_train = len(train_df.index)
total_validate = len(validate_df.index)

# Visualize train and validation samples

#train_df['category'].value_counts().plot.bar()
#validate_df['category'].value_counts().plot.bar()


# Training generator

# Pixel values are scaled by 1/255 and each training image is randomly
# rotated, sheared, zoomed, shifted and flipped horizontally.
augmentation_settings = dict(
    rescale=1./255,
    rotation_range=15,
    shear_range=0.1,
    zoom_range=0.2,
    width_shift_range=0.1,
    height_shift_range=0.1,
    horizontal_flip=True,
)
train_datagen = ImageDataGenerator(**augmentation_settings)

# Stream batches of (image, one-hot label) built from the rows of train_df;
# the files are looked up under "input/train/".
train_generator = train_datagen.flow_from_dataframe(
    train_df,
    "input/train/",
    batch_size=batch_size,
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    x_col='filename',
    y_col='category'
)

# Validation generator

# Validation images are only rescaled - no augmentation is configured here.
validation_datagen = ImageDataGenerator(rescale=1./255)

# Same column mapping, image size and batch size as the training generator,
# but reading the rows of validate_df.
validation_flow_options = dict(
    x_col='filename',
    y_col='category',
    class_mode='categorical',
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
)
validation_generator = validation_datagen.flow_from_dataframe(
    validate_df, "input/train/", **validation_flow_options
)

# See how the generator works

#example_df = train_df.sample(n=1).reset_index(drop=True)
#example_generator = train_datagen.flow_from_dataframe(
    #example_df, 
    #"input/train/", 
    #x_col='filename',
    #y_col='category',
    #target_size=IMAGE_SIZE,
    #class_mode='categorical'
#)

#plt.figure(figsize=(12, 12))
#for i in range(0, 15):
    #plt.subplot(5, 3, i+1)
    #for X_batch, Y_batch in example_generator:
        #image = X_batch[0]
        #plt.imshow(image)
        #break
#plt.tight_layout()
#plt.show()


# Fit model

# A fast run trains for 3 epochs only; a normal run allows up to 50
# (the callbacks may end training earlier).
if FAST_RUN:
    epochs = 3
else:
    epochs = 50

# Whole batches only: the integer division drops a trailing partial batch
# from both the per-epoch and the validation step counts.
history = model.fit_generator(
    train_generator,
    steps_per_epoch=total_train // batch_size,
    epochs=epochs,
    callbacks=callbacks,
    validation_data=validation_generator,
    validation_steps=total_validate // batch_size
)

# Persist the trained weights.
# NOTE(review): ".h1" looks like a typo for ".h5" - confirm before renaming,
# since other code may load this exact path.
model.save_weights("model.h1")

# Visualize loss and accuracy for training and validation
#fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(12, 12))
#ax1.plot(history.history['loss'], color='b', label="Training loss")
#ax1.plot(history.history['val_loss'], color='r', label="validation loss")
#ax1.set_xticks(np.arange(1, epochs, 1))
#ax1.set_yticks(np.arange(0, 1, 0.1))

#ax2.plot(history.history['acc'], color='b', label="Training accuracy")
#ax2.plot(history.history['val_acc'], color='r',label="Validation accuracy")
#ax2.set_xticks(np.arange(1, epochs, 1))

#legend = plt.legend(loc='best', shadow=True)
#plt.tight_layout()
#plt.show()

# Use this code for testing on a part of the samples

#prepare testing data with 100 samples
#test_images =  test_images[:100]
#test_df = pd.DataFrame({
    #'filename': test_images
#})
#nb_samples = test_df.shape[0]

# Prepare testing data

# One row per entry found in the test directory; no label column yet.
test_filenames = os.listdir("input/test")
test_df = pd.DataFrame(dict(filename=test_filenames))

# Number of test rows, used to size the prediction run.
nb_samples = len(test_df.index)

# Create testing generator

# Test images are only rescaled, matching the validation preprocessing.
test_gen = ImageDataGenerator(rescale=1./255)

# No labels are produced (y_col/class_mode are None) and shuffling is off so
# that predictions come back in the same order as the rows of test_df.
test_flow_options = dict(
    x_col='filename',
    y_col=None,
    class_mode=None,
    shuffle=False,
    target_size=IMAGE_SIZE,
    batch_size=batch_size,
)
test_generator = test_gen.flow_from_dataframe(test_df, "input/test", **test_flow_options)

# Predict

# Run enough batches to cover every test row, including a final partial
# batch. np.ceil returns a float, so the result is converted to int before it
# is passed as the step count.
predict = model.predict_generator(
    test_generator,
    steps=int(np.ceil(nb_samples / batch_size))
)

# The position of the largest output in each prediction row is the class index.
test_df['category'] = np.argmax(predict, axis=-1)

# Invert the training generator's name -> index mapping to turn class indices
# back into class names.
label_map = {index: name for name, index in train_generator.class_indices.items()}
test_df['category'] = test_df['category'].replace(label_map)

# Visualize results

#test_df['category'].value_counts().plot.bar()

# Results

# The submission file uses numeric labels: 1 for dog, 0 for cat.
test_df['category'] = test_df['category'].replace({ 'dog': 1, 'cat': 0 })

# Build the two submission columns directly: "id" is the part of the file
# name before the first '.', "label" is the numeric prediction.
submission_df = pd.DataFrame({
    'id': test_df['filename'].str.split('.').str[0],
    'label': test_df['category'],
})
submission_df.to_csv('submission.csv', index=False)

# Code for visualizing results of testing for 100 images

#sample_test = test_df
#sample_test.head()
#plt.figure(figsize=(12, 100))
#for index, row in sample_test.iterrows():
    #filename = row['filename']
    #category = row['category']
    #img = load_img("input/test/"+filename, target_size=IMAGE_SIZE)
    #plt.subplot(34, 3, index+1)
    #plt.imshow(img)
    #plt.xlabel(filename + '(' + "{}".format(category) + ')' )
#plt.tight_layout()
#plt.show()


# Notebook cell output: ['.DS_Store', 'test', 'train', 'sample_submission.csv']
