In [None]:
!pip install kaggle

In [None]:
# Restrict the Kaggle API token file to owner read/write only.
# NOTE(review): presumably the Kaggle CLI must be pointed at ../kaggle
# (e.g. through KAGGLE_CONFIG_DIR) - confirm where that is configured.
!chmod 600 ../kaggle/kaggle.json

In [None]:
# Download the dataset archive from Kaggle into ../data/raw.
!kaggle datasets download temesgentewolde/animal-dataset-intermediate -p ../data/raw

In [None]:
!unzip ../data/raw/animal-dataset-intermediate.zip -d ../data/raw

In [None]:
# Delete the downloaded archive from ../data/raw.
!rm ../data/raw/animal-dataset-intermediate.zip

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import numpy as np
import pathlib
import os
import PIL
import PIL.Image

import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Root folder of the training images, kept as a Path object so it can be globbed.
data_dir = pathlib.Path("../data/raw/animal_dataset_intermediate/train")
print(data_dir)

In [None]:
# Delete every training file whose header does not contain the "JFIF" marker.
# Such files are treated as corrupted images.
num_skipped = 0
for folder_name in ("elefante", "farfalla", "mucca", "pecora", "scoiattolo"):
    folder_path = os.path.join(data_dir, folder_name)
    for fname in os.listdir(folder_path):
        fpath = os.path.join(folder_path, fname)
        if not os.path.isfile(fpath):
            # Sub-directories cannot be opened as images; leave them alone.
            continue

        # `with` guarantees the handle is closed. If open() itself fails, the
        # real error is raised instead of a NameError / stale-handle close
        # from a `finally` block.
        with open(fpath, "rb") as fobj:
            is_jfif = b"JFIF" in fobj.peek(10)

        if not is_jfif:
            num_skipped += 1
            # Delete corrupted image
            os.remove(fpath)

print("Deleted %d images" % num_skipped)

In [None]:
# Count the training images (one sub-folder per class, .jpg and .jpeg files).
jpg_paths = list(data_dir.glob('*/*.jpg'))
jpeg_paths = list(data_dir.glob('*/*.jpeg'))
image_count = len(jpg_paths) + len(jpeg_paths)
print("Imported image_count: ", image_count)

In [None]:
# Open one file from the "elefante" folder as a quick visual sanity check.
elephant = list(data_dir.glob('elefante/*'))
first_elephant_path = str(elephant[0])
PIL.Image.open(first_elephant_path)

In [None]:
batch_size = 32
img_height = 256
img_width = 256

# Options shared by both subsets. Using the same seed and validation_split in
# both calls is what keeps the 80% training / 20% validation split consistent.
split_options = dict(
    validation_split=0.2,
    seed=123,
    image_size=(img_height, img_width),
    batch_size=batch_size,
)

# 80% of the images: used to fit the model.
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="training", **split_options
)

# Remaining 20%: used to evaluate the model.
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_dir, subset="validation", **split_options
)

Let's check that the `os` and PIL libraries are reading and displaying the images correctly.

In [None]:
# Class folder names (Italian); a label value is used as an index into this list.
class_names = [
    'elefante',    # elephant
    'farfalla',    # butterfly
    'mucca',       # cow
    'pecora',      # sheep
    'scoiattolo',  # squirrel
]

In [None]:
import matplotlib.pyplot as plt

# Draw the first nine images of one training batch in a 3x3 grid, each titled
# with its class name.
plt.figure(figsize=(10, 10))
for images, labels in train_ds.take(1):
    for idx in range(9):
        ax = plt.subplot(3, 3, idx + 1)
        pixels = images[idx].numpy().astype("uint8")
        ax.imshow(pixels)
        ax.set_title(class_names[labels[idx]])
        ax.axis("off")
plt.show()


In [None]:
def get_image_label(ds):
    """Collect every batch of a batched dataset into two NumPy arrays.

    Args:
        ds: object exposing ``as_numpy_iterator()`` whose elements are
            indexable, with the images at position 0 and the labels at
            position 1.

    Returns:
        Tuple ``(images, labels)`` where each array is the concatenation of
        the corresponding per-batch arrays along the first axis.
    """
    image_batches, label_batches = [], []
    for batch in ds.as_numpy_iterator():
        image_batches.append(batch[0])
        label_batches.append(batch[1])

    images = np.concatenate(image_batches)
    labels = np.concatenate(label_batches)
    return (images, labels)

# Materialise both datasets as in-memory NumPy arrays.
x_train, y_train = get_image_label(train_ds)
x_val, y_val = get_image_label(val_ds)

# Report the container types, then shape / rank for each split.
print(type(x_train), type(y_train))
for features, targets in ((x_train, y_train), (x_val, y_val)):
    print(features.shape, targets.shape, features.ndim)

In [None]:
# Sanity checks on the extracted arrays.
assert isinstance(x_train, (np.ndarray, np.generic))
assert isinstance(y_train, (np.ndarray, np.generic))
assert isinstance(x_val, (np.ndarray, np.generic))
assert isinstance(y_val, (np.ndarray, np.generic))

# Image arrays must be 4-D and label arrays 1-D.
# (`assert a, b == c` only tests `a` and uses `b == c` as the failure message,
# so each condition is written out as a real comparison.)
assert x_train.ndim == 4 and x_val.ndim == 4
assert y_train.ndim == 1 and y_val.ndim == 1

In [None]:
# Bar chart of the number of training samples per class, to show class imbalance.

plt.figure(figsize=(8, 3))
# Pass the labels with the `x=` keyword: a bare positional argument is
# interpreted as `data` by current seaborn releases.
splot = sns.countplot(x=y_train)
# Write each bar's count just above the bar.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), 
    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

splot.set_xticklabels(class_names)
splot.set_xlabel("Classes")
splot.set_ylabel("Count")
plt.show()

In [None]:
from sklearn.utils import resample 
from imblearn.over_sampling import SMOTE

print("-" * 70)
print('Input shape before resampling: ' ,x_train.shape, y_train.shape)

# SMOTE works on a 2-D (samples, features) matrix, so flatten every image.
# k, nx, ny hold the three per-image dimensions and are reused later to
# restore the 4-D layout.
nsamples, k, nx, ny = x_train.shape
x_train = x_train.reshape((nsamples, k*nx*ny))

# The strategy is passed by keyword because the constructor arguments are
# keyword-only in current imbalanced-learn releases. A fixed random_state makes
# the synthetic samples reproducible.
smote = SMOTE(sampling_strategy='all', random_state=123)
x_train, y_train = smote.fit_resample(x_train, y_train)

print("-" * 70)
print('Input shape after sampling: ' ,x_train.shape, y_train.shape)
print('Class distribution after over-sampling: ')
for i in range(len(class_names)):
    # Count the labels directly instead of slicing (and copying) x_train.
    print(f'Number of class {class_names[i]} examples after:{np.count_nonzero(y_train == i)}')

In [None]:
# Bar chart of the number of samples per class in the current y_train.
plt.figure(figsize=(8, 3))
# Pass the labels with the `x=` keyword: a bare positional argument is
# interpreted as `data` by current seaborn releases.
splot = sns.countplot(x=y_train)
# Write each bar's count just above the bar.
for p in splot.patches:
    splot.annotate(format(p.get_height(), '.0f'), (p.get_x() + p.get_width() / 2., p.get_height()), 
    ha = 'center', va = 'center', xytext = (0, 10), textcoords = 'offset points')

splot.set_xticklabels(class_names)
splot.set_xlabel("Classes")
splot.set_ylabel("Count")
plt.show()

In [None]:
# Return to the original 4-D shape. -1 lets NumPy infer the number of samples
# instead of hard-coding it, so the cell keeps working when the dataset size
# (or the number of synthetic samples) changes.
x_train = x_train.reshape(-1, k, nx, ny)


In [None]:
AUTOTUNE = tf.data.AUTOTUNE

# Cache both datasets and prefetch upcoming batches, letting tf.data choose the
# prefetch buffer size.
train_ds, val_ds = (
    dataset.cache().prefetch(buffer_size=AUTOTUNE)
    for dataset in (train_ds, val_ds)
)

In [None]:
# Random augmentation layers, used as the first stage of the model.
data_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal", input_shape=(img_height, img_width, 3)),
        layers.RandomRotation(0.1),
        layers.RandomZoom(0.1),
        layers.RandomContrast((0.1, 0.9)),
    ]
)

# x_train must be a 4-D array at this point. Building the layers above does
# not transform x_train itself.
assert x_train.ndim == 4

In [None]:
import os
import time

# Parent folder for all TensorBoard runs.
root_logdir = os.path.join(os.curdir, "my_logs")


def get_run_logdir():
    """Return a path under ``root_logdir`` named after the current local time."""
    run_name = time.strftime("run_%Y_%m_%d-%H_%M_%S")
    return os.path.join(root_logdir, run_name)

# Log directory for this run, e.g. './my_logs/run_2019_06_07-15_15_22'.
run_logdir = get_run_logdir()

# Callback that writes TensorBoard logs into that directory.
tensorboard_cb = keras.callbacks.TensorBoard(log_dir=run_logdir)

In [None]:
# Load the TensorBoard notebook extension, then start TensorBoard on the
# ./my_logs directory using port 6006.
%load_ext tensorboard
%tensorboard --logdir=./my_logs --port=6006

In [None]:
num_classes = 5

# Small CNN: augmentation -> pixel rescaling -> three conv/max-pool stages ->
# dropout -> dense classifier that outputs one raw score (logit) per class.
model = keras.Sequential([
    data_augmentation,
    layers.Rescaling(1./255),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Conv2D(32, 3, activation='relu'),
    layers.MaxPooling2D(),
    # Dropout: https://www.tensorflow.org/tutorials/images/classification#dropout
    layers.Dropout(0.2),
    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    layers.Dense(num_classes),
])


In [None]:
# The last Dense layer has no activation, so the loss is told to expect logits.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

model.compile(optimizer='adam', loss=loss_fn, metrics=['accuracy'])


# Two callback options for implementing early stopping to avoid overfitting.

#1. keras.callbacks.ModelCheckpoint() saves the model when its performance on the validation set is the best so far

#2. keras.callbacks.EarlyStopping() interrupts training when it measures no progress on the validation set for a number of epochs (defined by the patience argument), and it will optionally roll back to the best model.

It's possible to combine both callbacks: save checkpoints of your model (in case the computer crashes) and interrupt training early when there is no more progress (to avoid wasting time and resources).

In [None]:
# epochs = 50
# checkpoint_cb = keras.callbacks.ModelCheckpoint("saved_model/keras_model.h5", save_best_only=True)
# early_stopping_cb = keras.callbacks.EarlyStopping(patience=10,
#                                                   restore_best_weights=True)
# history =  model.fit(
#   x_train, y_train,
#   validation_data=(x_val, y_val),
#   epochs=epochs,
#   callbacks=[checkpoint_cb, early_stopping_cb, tensorboard_cb]
# )


`checkpoint_cb` saves the best model. Just in case, though, we'll also save the final model.

In [None]:
# model.save('../saved_model/best_model') 

In [None]:
# model.evaluate(x_val, y_val, verbose=2) # ~78% accuracy

In [None]:
# Load the previously saved model from disk and score it on the validation
# arrays (~78% accuracy).
saved_model_path = '../my_keras_model.h5'
new_model = tf.keras.models.load_model(saved_model_path)
new_model.evaluate(x_val, y_val, verbose=2)


In [None]:
# TODO: Calculate validation accuracy per class. 


In [None]:
import pandas as pd

# Let DataFrame cells display up to 999 characters before being truncated.
pd.set_option("display.max_colwidth", 999)

In [None]:
# Folder holding the test images (files sit directly in it, no class sub-folders).
test_data_dir = pathlib.Path("../data/raw/animal_dataset_intermediate/test")
print(test_data_dir)

# Count the .jpg and .jpeg files.
jpg_files = list(test_data_dir.glob('*.jpg'))
jpeg_files = list(test_data_dir.glob('*.jpeg'))
image_count = len(jpg_files) + len(jpeg_files)
print("Imported image_count: ", image_count)

# Open one test file as a visual sanity check.
picture = list(test_data_dir.glob('*'))
PIL.Image.open(str(picture[0]))

In [None]:
# Predict a class index for every file listed in the submission template.
df_submission_filename = pd.read_csv("../data/raw/animal_dataset_intermediate/Testing_set_animals.csv")

classes = []

df_len = df_submission_filename.shape[0]
for i in range(df_len): 
    # The first column holds the image file name; use positional indexing.
    path = os.path.join(test_data_dir, df_submission_filename.iloc[i, 0])
    # `grayscale=` is dropped: color_mode='rgb' already selects 3 channels and
    # the argument no longer exists in recent Keras versions.
    # NOTE(review): 'nearest' resizing here may differ from the resizing used
    # when the training datasets were built - confirm this is intended.
    img = tf.keras.utils.load_img(
        path, color_mode='rgb', target_size=(img_height, img_width),
        interpolation='nearest'
    )
    img_array = tf.keras.utils.img_to_array(img)
    # Add a leading batch axis without relying on shape variables that were
    # defined in another cell.
    img_array = np.expand_dims(img_array, axis=0)
    # Use the loaded model (`new_model`): `model` is only compiled in this
    # notebook, its training cell is commented out.
    predict_img = new_model.predict(img_array, verbose=0)
    classes_img = np.argmax(predict_img,axis=1)
    classes.append(classes_img[0])


In [None]:
# Translate each predicted class index into its class name and store the names
# in a new 'target' column.
predicted_labels = [class_names[class_index] for class_index in classes]
df_submission_filename['target'] = predicted_labels
df_submission_filename.head()

In [None]:
# Write the submission file. The output folder is created first because
# to_csv() does not create missing parent directories.
os.makedirs("submission", exist_ok=True)
df_submission_filename.to_csv("submission/submission.csv", index=False)