In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from os import listdir
from os.path import basename,join,exists
import os
print(listdir("../input"))
import threading
from queue import Queue
from math import floor
import time
import pickle
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline
# Any results you write to the current directory are saved as output.

In [None]:
# Directories holding the raw competition images.
train_dir_path = "../input/train"
test_dir_path = "../input/test"
#pickled_dir_path  = "../output/pickled_Data"

# labels.csv is queried by dog_breed_from_id() through its 'id' and 'breed' columns.
labels_df = pd.read_csv('../input/labels.csv')

# Every column of the sample submission after the first one is treated as a breed name.
df = pd.read_csv('../input/sample_submission.csv')
dog_breeds = df.columns[1:].tolist()
print(len(dog_breeds))
print(dog_breeds)

In [None]:
# Paths (directory + entry name) of everything listed in the train and test image directories.
train_img_fpaths = [join(train_dir_path, fname) for fname in listdir(train_dir_path)]
test_img_fpaths = [join(test_dir_path, fname) for fname in listdir(test_dir_path)]
print(len(train_img_fpaths), len(test_img_fpaths), sep="\n")

In [None]:
def dog_breed_from_id(dog_id):
    """Return the 'breed' values of the labels_df rows whose 'id' equals dog_id.

    The result is the ``.values`` array of the matching rows (empty when
    nothing matches), not a scalar string.
    """
    #labels_df = pd.read_csv('../input/labels.csv')
    id_matches = labels_df['id'] == dog_id
    return labels_df.loc[id_matches, 'breed'].values

In [None]:
import cv2 as cv
import numpy as np

In [None]:
# Shared configuration.
# Target image geometry used by img_to_array() and as the model's input shape.
IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS = 150, 150, 3
# Number of image paths handed to a worker thread as one batch in get_data().
BATCH_SIZE = 1000
# NOTE(review): this lock is not acquired by any cell visible in this notebook.
lock = threading.Lock()

In [None]:
def img_to_array(img_path):
    """Read an image file and return it resized as a single-image batch.

    Args:
        img_path: path of the image file to read with OpenCV.

    Returns:
        Array reshaped to (-1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS).

    Raises:
        ValueError: if OpenCV could not read the file (cv.imread returned None).
    """
    img_array = cv.imread(img_path)
    if img_array is None:
        # cv.imread signals failure by returning None rather than raising;
        # fail here with the offending path instead of inside cv.resize.
        raise ValueError("could not read image: {}".format(img_path))
    # cv.resize expects dsize as (width, height); the resulting array is
    # (height, width, channels), which is what the reshape below assumes.
    img_array = cv.resize(img_array, (IMG_WIDTH, IMG_HEIGHT))
    return img_array.reshape(-1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)

In [None]:
# build the queue.Queue used to hand file batches to the worker threads
def initialize_queue():
    """Return a new, empty queue.Queue."""
    return Queue()

In [None]:
# derive image ids from the file names of the test images
def get_test_image_ids():
    """Return, for each path in test_img_fpaths, its base name up to the first '.'."""
    image_ids = []
    for fpath in test_img_fpaths:
        file_name = basename(fpath)
        image_ids.append(file_name.split('.')[0])
    return image_ids

In [None]:
# converts image files to arrays on worker threads and returns the raw per-image tuples
def get_data(is_train):
    """Convert the train or test images to arrays using one thread per batch.

    The file list is cut into batches of BATCH_SIZE paths (the last batch
    holds the remainder). Every batch is queued and one worker thread is
    started per batch.

    Args:
        is_train: truthy to process train_img_fpaths with
            get_train_data_parallely, falsy to process test_img_fpaths with
            get_testing_data_parallely.

    Returns:
        The shared list the workers appended to: (image array, breed) tuples
        for training data, (image id, image array) tuples for test data.
        The order depends on thread scheduling, not on the file order.
    """
    img_fpaths = train_img_fpaths if is_train else test_img_fpaths
    worker = get_train_data_parallely if is_train else get_testing_data_parallely

    # Stepping by BATCH_SIZE avoids the empty trailing batch (and idle thread)
    # that appeared whenever the file count was an exact multiple of BATCH_SIZE.
    batches = [img_fpaths[start:start + BATCH_SIZE]
               for start in range(0, len(img_fpaths), BATCH_SIZE)]
    print("num of threads:", len(batches))
    if is_train:
        print("getting training data....")
    else:
        print("getting testing data....")

    queue = initialize_queue()
    results = []          # shared by all workers; each appends its tuples here

    # load the queue completely before any worker starts pulling from it
    for file_batch in batches:
        queue.put(file_batch)

    worker_threads = []
    for _ in batches:
        thread = threading.Thread(target=worker, args=(queue, results))
        thread.start()
        print("{} started".format(thread.name))
        worker_threads.append(thread)

    # Wait on the threads themselves rather than on queue.join(): a worker that
    # dies from an exception never calls task_done(), which would make
    # queue.join() block forever, whereas Thread.join() returns once it exits.
    for thread in worker_threads:
        thread.join()
    return results

In [None]:
# worker job: convert queued batches of training images into (array, breed) tuples
def get_train_data_parallely(queue, results):
    """Drain file batches from `queue`, appending (image array, breed) to `results`.

    Args:
        queue: queue.Queue whose items are lists of training image paths.
        results: shared list; one (img_to_array(path), dog_breed_from_id(id))
            tuple is appended per image, where id is the file's base name up
            to the first '.'.

    Each fetched batch is acknowledged with exactly one task_done() call, so
    queue.join() on the producer side is released only after every batch has
    been processed, regardless of how batches are spread over the threads.
    """
    # Local import: only this function needs Empty, and the module name is
    # resolved by the import system, so the `queue` parameter does not interfere.
    from queue import Empty

    while True:
        try:
            # get_nowait() instead of empty()+get(): another thread may take the
            # last batch between the two calls, leaving get() blocked forever.
            fpaths = queue.get_nowait()
        except Empty:
            break
        try:
            for f_path in fpaths:
                img_array = img_to_array(f_path)
                img_id = basename(f_path).split('.')[0]
                dog_breed = dog_breed_from_id(img_id)
                results.append((img_array, dog_breed))
        finally:
            # acknowledge this batch even if processing raised, so a failing
            # worker cannot leave queue.join() waiting indefinitely
            queue.task_done()

    print("{} finished".format(threading.current_thread().name))

In [None]:
# worker job: convert queued batches of test images into (id, array) tuples
def get_testing_data_parallely(queue, results):
    """Drain file batches from `queue`, appending (image id, image array) to `results`.

    Args:
        queue: queue.Queue whose items are lists of test image paths.
        results: shared list; one (id, img_to_array(path)) tuple is appended
            per image, where id is the file's base name up to the first '.'.

    Each fetched batch is acknowledged with exactly one task_done() call, so
    queue.join() on the producer side is released only after every batch has
    been processed.
    """
    # Local import: the module name is resolved by the import system, so the
    # `queue` parameter does not interfere.
    from queue import Empty

    while True:
        try:
            # get_nowait() instead of empty()+get(): another thread may take the
            # last batch between the two calls, leaving get() blocked forever.
            file_batch = queue.get_nowait()
        except Empty:
            break
        try:
            for f_path in file_batch:
                img_id = basename(f_path).split('.')[0]
                results.append((img_id, img_to_array(f_path)))
        finally:
            # acknowledge this batch even if processing raised
            queue.task_done()

    print("{} finished".format(threading.current_thread().name))

In [None]:
# builds the training array and its one-hot labels
def get_training_data():
    """Return (train_arr, train_labels) built from get_data(is_train=True).

    train_arr stacks every image array into shape
    (-1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS) and divides it by 255;
    train_labels is the collected breeds passed through one_hot_encode_labels().
    """
    train_results = get_data(is_train = True)
    # each result is an (image array, breed) tuple; split the two columns
    img_arrays = [img_arr for img_arr, _ in train_results]
    breeds = [identified_breed for _, identified_breed in train_results]
    stacked = np.array(img_arrays).reshape(-1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
    train_arr = stacked/255
    train_labels = one_hot_encode_labels(breeds)
    return train_arr,train_labels

In [None]:
# builds the test array together with the matching image ids
def get_testing_data():
    """Return (test_img_arr, test_img_ids) built from get_data(is_train=False).

    test_img_arr stacks every image array into shape
    (-1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS) and divides it by 255;
    test_img_ids lists the image ids in the same order as the rows.
    """
    results = get_data(is_train = False)
    # each result is an (image id, image array) tuple; split the two columns
    test_img_ids = [img_id for img_id, _ in results]
    test_img_list = [img_arr for _, img_arr in results]
    stacked = np.array(test_img_list).reshape(-1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)
    return stacked/255, test_img_ids

In [None]:
def save_obj_to_disk(fname, obj):
    """Pickle `obj` into the file `fname`.

    An existing file is reported and then overwritten.

    Args:
        fname: path of the pickle file to write.
        obj: any picklable object.
    """
    print("saving "+ fname +" to filesystem")
    if exists(fname):
        # the file is opened in 'wb' mode below, so the old content is replaced
        print(fname + " already exists, overwriting")
    with open(fname, 'wb') as f:
        pickle.dump(obj, f)

In [None]:
def load_obj_from_disk(fname):
    """Unpickle and return the object stored in `fname`.

    Args:
        fname: path of the pickle file to read.

    Returns:
        The unpickled object, or None (after printing a notice) when the file
        does not exist.
    """
    if not exists(fname):
        print(fname + " does not exist")
        return None
    print("loading "+fname + " from filesystem")
    # NOTE: pickle.load can execute arbitrary code from the file; only use this
    # on pickles produced by this notebook or another trusted source.
    with open(fname, 'rb') as f:
        return pickle.load(f)

In [None]:
def load_train_test_data(load_train=False, load_test=False, one_hot_encode=False):
    """Load the train/test arrays, using pickles in the working directory as a cache.

    Args:
        load_train: when true, return the training array and labels. They are
            read from "train_data.pickle" / "train_labels.pickle" if both
            exist, otherwise rebuilt from the images and both pickles are
            written.
        load_test: when true, return the test array, read from
            "test_data.pickle" if it exists, otherwise rebuilt and pickled.
        one_hot_encode: when true, the returned training labels are passed
            through one_hot_encode_labels(). The pickled labels are always the
            un-encoded breeds, so the flag behaves the same on both paths.

    Returns:
        (train_arr, train_labels, test_arr); parts that were not requested are
        None. The test image ids are not part of the result; use
        get_testing_data() when they are needed.
    """
    train_arr = None
    train_labels = None
    test_arr = None

    # training data: reuse the pickled pair only when both halves are present
    if load_train:
        if exists("train_data.pickle") and exists("train_labels.pickle"):
            train_arr = load_obj_from_disk("train_data.pickle")
            train_labels = load_obj_from_disk("train_labels.pickle")
        else:
            # get_data() yields (image array, breed) tuples, not a ready
            # (array, labels) pair, so the two columns are separated here
            train_results = get_data(is_train=True)
            train_arr = np.array([img for img, _ in train_results]).reshape(
                -1, IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS) / 255
            train_labels = [breed for _, breed in train_results]
            # Always write both files: the tuple order depends on thread
            # scheduling, so keeping one stale pickle next to a fresh one
            # would pair images with labels from a different run.
            save_obj_to_disk("train_data.pickle", train_arr)
            save_obj_to_disk("train_labels.pickle", train_labels)
        if one_hot_encode:
            train_labels = one_hot_encode_labels(train_labels)
        print(" train array shape : {}, train array labels: {}".format(train_arr.shape,len(train_labels)))

    # test data: reuse the pickle when present, otherwise rebuild and cache it
    if load_test:
        if exists("test_data.pickle"):
            test_arr = load_obj_from_disk("test_data.pickle")
        else:
            # get_testing_data() returns (array, ids); only the array is cached
            test_arr, _ = get_testing_data()
            save_obj_to_disk("test_data.pickle", test_arr)
        print(" test array shape : {}".format(test_arr.shape))
    return train_arr, train_labels, test_arr

In [None]:
# one-hot encodes the labels that belong to train_arr
def one_hot_encode_labels(label_arr):
    """Return a dense one-hot matrix for `label_arr`.

    The labels are first mapped to integer codes with LabelEncoder, the codes
    are reshaped into a single column, and OneHotEncoder expands that column;
    the sparse result is converted with ``toarray()``.
    """
    from sklearn.preprocessing import LabelEncoder ,OneHotEncoder
    class_codes = LabelEncoder().fit_transform(np.array(label_arr))
    code_column = class_codes.reshape(-1,1)
    return OneHotEncoder().fit_transform(code_column).toarray()

In [None]:
# x: image array divided by 255, y: one-hot encoded breed labels (see get_training_data)
x, y =get_training_data()

In [None]:
# Hold out 30% of the labelled images for validation. The fixed random_state
# makes the split identical on every Restart & Run All, so validation scores
# of different runs are comparable.
train_x, valdn_x, train_y, valdn_y = train_test_split(x, y, test_size=0.3, random_state=42)

In [None]:
# test_x: test image array divided by 255; test_img_ids: ids in the same row order
test_x, test_img_ids = get_testing_data()

In [None]:
# sanity check: shapes of every split, then the number of test ids
print(*(arr.shape for arr in (train_x, train_y, valdn_x, valdn_y, test_x)), sep="\n")
print(len(test_img_ids))

In [None]:
# import required packages
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D
from keras.utils import np_utils

In [None]:
# CNN model: four conv -> batch-norm -> ReLU -> 2x2 max-pool stages followed by four dense layers.
# The shape comments below assume Keras' default Conv2D settings (no padding, stride 1),
# i.e. every 3x3 conv trims 2 pixels per spatial dimension -- TODO confirm against model.summary().
model = Sequential()

# -----------------------------------------------------------------------------------
# conv 1
model.add(Conv2D(16, (3,3), input_shape=(IMG_HEIGHT, IMG_WIDTH, IMG_CHANNELS)))       # input- N,150,150,3, output- N,148,148,16
model.add(BatchNormalization(axis=3))                                                # normalise over the channel axis
model.add(Activation('relu'))
#model.add(Dropout(0.5))

# max pool 1
model.add(MaxPooling2D(pool_size=(2,2),strides=2))                                   # input- N,148,148,16, output- N,74,74,16

# -----------------------------------------------------------------------------------
# conv 2
model.add(Conv2D(32, (3,3)))                                                         # input- N,74,74,16, output- N,72,72,32
model.add(BatchNormalization(axis=3))
model.add(Activation('relu'))
#model.add(Dropout(0.5))

# max pool 2
model.add(MaxPooling2D(pool_size=(2,2),strides=2))                                 # input- N,72,72,32, output- N,36,36,32
# -----------------------------------------------------------------------------------

# conv 3
model.add(Conv2D(48, (3,3)))                                                       # input- N,36,36,32, output- N,34,34,48
model.add(BatchNormalization(axis=3))
model.add(Activation('relu'))
#model.add(Dropout(0.7))

# max pool 3
model.add(MaxPooling2D(pool_size=(2,2),strides=2))                                # input- N,34,34,48, output- N,17,17,48
# -----------------------------------------------------------------------------------

# conv 4
model.add(Conv2D(64, (3,3)))                                                     # input- N,17,17,48, output- N,15,15,64
model.add(BatchNormalization(axis=3))
model.add(Activation('relu'))
#model.add(Dropout(0.7))
# max pool 4
model.add(MaxPooling2D(pool_size=(2,2),strides=2))                              # input- N,15,15,64, output- N,7,7,64

# flatten
model.add(Flatten())                                                            # output- N,3136 (7*7*64)

# fc layer 1
model.add(Dense(1024, activation='relu'))                                  

# fc layer 2
model.add(Dense(512, activation='relu'))

# fc layer 3
model.add(Dense(256, activation='relu'))

# fc layer 4: 120 softmax outputs, one per class
model.add(Dense(120, activation='softmax'))

In [None]:
# print the layer-by-layer summary of the model built above
model.summary()

In [None]:
# compile the model with categorical cross-entropy loss and the Adam optimizer;
# accuracy is the metric reported during training and evaluation
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
# Train on the training split and evaluate on the validation split after every epoch.
# `epochs` is the Keras 2 keyword; the Keras 1 spelling `nb_epoch` is deprecated
# and rejected by later releases.
model_hist = model.fit(train_x, train_y, batch_size=64, epochs=100, verbose=1, validation_data=(valdn_x, valdn_y))

In [None]:
# class probabilities for every test image, one row per row of test_x
predictions = model.predict(test_x, batch_size=32, verbose=1)

In [None]:
# sanity check: the prediction column count should equal the number of breeds,
# because dog_breeds is used as the column header of the submission
print(predictions.shape)
print(len(dog_breeds))

In [None]:
import pandas as pd
# One row per test image id, one column per breed.
# NOTE(review): this assumes the model's output columns follow the same order as
# dog_breeds (the sample-submission header) -- confirm against the label encoding.
submission_index = pd.Index(test_img_ids, name='id')
submission_res = pd.DataFrame(predictions, index=submission_index, columns=dog_breeds)
submission_res.to_csv('submission.csv', index=True, encoding='utf-8')

In [None]:
# summarize history for accuracy
# Keras releases differ in the history key ('acc' vs 'accuracy'); use whichever exists.
acc_key = 'acc' if 'acc' in model_hist.history else 'accuracy'
plt.plot(model_hist.history[acc_key])
# validation_data was passed to fit(), so the validation curve is recorded too;
# plotting it gives the second legend entry an actual line
plt.plot(model_hist.history['val_' + acc_key])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()


In [None]:
# summarize history for loss
plt.plot(model_hist.history['loss'])
# validation_data was passed to fit(), so 'val_loss' is recorded as well;
# plotting it gives the second legend entry an actual line
plt.plot(model_hist.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'validation'], loc='upper left')
plt.show()