# Dog breeds identification

This is a multi-class classification problem: we are asked to identify dog breeds from images of dogs.
The dataset comprises 120 breeds of dogs.

The dataset is deposited [here](https://www.kaggle.com/competitions/dog-breed-identification).


# Config

#### Python libraries

In [None]:
## SYSTEM LIBRARIES
import glob     #for checking dir content
import os       #for dir creation
import requests #for data download
import zipfile  #for unpacking zipped files

## DATA SCIENCE AND PREPROCESSING LIBRARIES
import numpy as np
import pandas as pd
from sklearn import preprocessing
import matplotlib.pyplot as plt

## TENSORFLOW AND KERAS
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from keras.preprocessing.image import ImageDataGenerator

from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from keras.layers import Dropout
from tensorflow.keras.optimizers import RMSprop

#### Support functions

In [None]:
## function to plot loss and accuracy over epochs
def plot_loss_history(h, title):
    """Plot every training metric of a Keras training log, one figure per metric.

    Args:
        h: object exposing a ``history`` dict that maps metric names to
           per-epoch value lists (e.g. the value returned by ``model.fit``).
        title: prefix used for each figure title.

    For each metric that does not start with ``val_`` the training curve is
    drawn; the matching ``val_<metric>`` curve is added only when it exists,
    so logs recorded without validation data (or containing keys with no
    validation counterpart) no longer raise ``KeyError``.
    """
    history = h.history
    for metric, train_values in history.items():
        #validation metrics are drawn together with their training
        #counterpart, not as figures of their own
        if metric.startswith('val_'):
            continue

        plt.plot(train_values, label = "Train set")

        #the validation curve is optional
        val_values = history.get("val_" + metric)
        if val_values is not None:
            plt.plot(val_values, label = "Validation set")

        plt.xlabel('Epochs')
        plt.title(title + ' - ' + metric)
        plt.legend()
        plt.show()

#### Parameters

In [None]:
# Remote location of the zipped image archive.
# Alternative sources kept for reference:
#   'http://www.jackdellequerce.com/data/reduced_chest_xray.zip'
#   'https://cloud.cnr.it/owncloud/index.php/s/TU2f6k6gMOiPHci'
data_url = 'http://www.jackdellequerce.com/data/dogs/reduced.zip'

# Local folder receiving the download, and the root folder of the
# unpacked archive inside it.
download_target_imgs = '/content/data/'
base_dir = download_target_imgs + 'reduced/'

# Keras constants: batch size, image height/width, and the shapes
# derived from them (2D target size and 3-channel network input).
BATCH_SIZE = 32
IMAGE_SIZE = [180, 180]
IMAGE_SHAPE = tuple(IMAGE_SIZE)
INPUT_SHAPE = IMAGE_SHAPE + (3,)

### Getting the data

The data are first downloaded as a zipped archive, which is then uncompressed and stored in `download_target_imgs` (the zipped archive has an internal structure with root `reduced/`)

In [None]:
#!rm -r /content/data/reduced/train

In [None]:
#room for data
os.makedirs(download_target_imgs, exist_ok=True)

#downloading: stop on HTTP errors instead of saving an error page as the
#archive, and close the output file deterministically
r = requests.get(data_url, timeout=60)
r.raise_for_status()
with open(download_target_imgs + 'local_archive.zip', 'wb') as archive_file:
    archive_file.write(r.content)

In [None]:
#unpacking: the context manager closes the archive once extraction is done
with zipfile.ZipFile(download_target_imgs + 'local_archive.zip') as z:
    z.extractall(path = download_target_imgs)

We have two subfolders:
- one contains images for training
- one contains test images

In [None]:
#at this point data is there, we are ready to get the list of files
train_filenames = glob.glob(base_dir + 'train/*')
test_filenames   = glob.glob(base_dir + 'test/*')

#whatever the original case, at this point we have the files
print('Available images for train: ' + str(len(train_filenames)))
print('Available images for test: ' + str(len(test_filenames)))

#### Reading labels

We now read in a `.csv` file with labels (breed name) corresponding to each image in the training set:

In [None]:
label_df = pd.read_csv(base_dir + 'labels_reduced.csv')
print('Training set: {}'.format(label_df.shape))

In [None]:
label_df

String labels ("words") are converted to numbers (DNNs are matrix algebra machines: they understand numbers, not words!)

In [None]:
# Turn each breed name into an integer code, stored in a new 'label' column
label_df['label'] = preprocessing.LabelEncoder().fit_transform(label_df['breed'])

# Lookup table: one row per distinct (code, breed) pair, indexed by the code
dict_df = label_df[['label', 'breed']].drop_duplicates().set_index('label')

# Plain dictionary {code: breed name}
index_to_breed = dict_df['breed'].to_dict()

In [None]:
label_df

We now have a small Python dictionary holding the correspondence between numeric code and breed name:

In [None]:
index_to_breed

In [None]:
import seaborn as sns

#write the length of each bar next to it (horizontal bar plots)
def barw(ax):
    """Annotate every patch of ``ax`` with its width rounded to 2 decimals.

    The text is anchored at the right end of the patch (``x + width``) and
    at its vertical centre (``y + height / 2``).
    """
    for bar in ax.patches:
        length = bar.get_width()  # horizontal extent of the bar
        anchor = (bar.get_x() + length, bar.get_y() + bar.get_height() / 2)
        ax.annotate(round(length, 2), anchor)
        
#finding the most frequent dog breeds

plt.figure(figsize = (5,3))
ax0 =sns.countplot(y=label_df['breed'],order=label_df['breed'].value_counts().index)
barw(ax0)
plt.show()

A little preprocessing here: 
- we convert numeric codes to OHE (one-hot encoded) vectors
- we add the suffix `.jpg` to the image names (because the actual files do have this extension)

In [None]:
target = label_df['label'].to_numpy().tolist()
target = tf.keras.utils.to_categorical(target)

In [None]:
target = pd.DataFrame(target)
target = target.join(label_df['id'])
target['id'] = target['id'] + '.jpg'

In [None]:
#newcols = [str(x) for x in target.columns[:-1]]
#newcols.append('id')
#target.columns = newcols
target

Let's have a look at one training image:

In [None]:
# Lets check one image
from IPython.display import display, Image

fname = os.path.join(base_dir, 'train', target['id'][0])
Image(fname)

### ImageDataGenerator

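Here we set up data augmentation for the training data (validation images are only rescaled). The transformations below are available; shearing and height/width shifts are currently disabled in the code that follows: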
- image flipping
- brightness adjustments
- image rotation
- shearing and zooming
- height/width shifts
- color channel shifts 

In [None]:
colnames = [x for x in target.columns[:-1]]

In [None]:
colnames

In [None]:
#declare two objects
train_datagen      = ImageDataGenerator(
    rescale=1./255, 
    horizontal_flip=True, 
    vertical_flip=True, 
    brightness_range = [0.5, 1.5],
    #shear_range=0.1,
    zoom_range=0.2,
    #height_shift_range=0.1,
    channel_shift_range=0.4,
    rotation_range=40)

validation_datagen = ImageDataGenerator(rescale=1./255)

### Training and validation sets

Choose a number of samples to be actually used for training in a training/validation split scheme:

In [None]:
#fixed random_state so that the train/validation split is the same on every
#run (the generators below are seeded with the same value)
training_set = target.sample(n=275, random_state=42)

In [None]:
training_set

Get validation images (those not used in training):

In [None]:
#left-join all images against the ids picked for training: the '_merge'
#indicator column flags rows found only in `target` as 'left_only',
#i.e. the images that were not selected for training
train_ids = training_set['id'].drop_duplicates()
df_all = pd.merge(target, train_ids, how='left', on=['id'], indicator=True)
validation_set = df_all.loc[df_all['_merge'] == 'left_only']

In [None]:
len(validation_set)

Reading data from the dataframe (image file names and OHE labels).
If `class_mode` is set to `"raw"`, the values in the column (or list of columns) given by `y_col` are treated as raw target values, so make sure those columns hold numerical data. This is useful for regression tasks (e.g. predicting the steering-wheel angle from an image) or for models that need to predict multiple values at the same time.

In [None]:
#stream augmented training batches from the dataframe: file names come from
#the 'id' column, the one-hot label vector from the `colnames` columns
#NOTE(review): with shuffle=False the images are presumably served in the
#same (dataframe) order at every epoch and `seed` likely has no effect on
#ordering; confirm this is intended for training
train_generator = train_datagen.flow_from_dataframe(
    dataframe=training_set,
    directory=base_dir + 'train/',
    x_col="id",
    y_col=colnames,
    class_mode="raw",
    target_size=IMAGE_SHAPE,
    batch_size=BATCH_SIZE,
    shuffle=False,
    seed=42,
)

In [None]:
#stream (rescaled only) validation batches from the dataframe
#shuffle stays off so that predictions can later be compared, row by row,
#with `valid_generator.labels`
valid_generator = validation_datagen.flow_from_dataframe(
    dataframe=validation_set,
    directory=base_dir + 'train/',
    x_col="id",
    y_col=colnames,
    class_mode="raw",
    target_size=IMAGE_SHAPE,
    batch_size=6,
    shuffle=False,
    seed=42,
)

In [None]:
#fetch one batch through the iterator protocol: the built-in next() does not
#rely on the generator's `.next()` method, which is not available in every
#Keras version
x, y = next(train_generator)
print(y[0:5])       #one-hot labels of the first five images
print(x[0].shape)   #shape of a single (augmented) image
#print(x[0])
plt.imshow(x[5])
plt.show()

In [None]:
target.shape

## Model building

Now we are ready to build our DNN model.
We do this by stacking multiple layers of "neurons" (nodes) one on top of the other.
Basically, in this simple example we replicate the same substructure:
- convolutional layer with varying number of nodes
- max pooling layer to reduce the complexity
- dropout layer for regularization

In [None]:
# Number of dog breeds to recognise; used as the width of the final softmax layer.
# NOTE(review): presumably must equal the number of one-hot label columns
# (len(colnames)); confirm when changing the dataset.
NCLASSES = 4
# Number of epochs passed to model.fit.
EPOCHS = 50

In [None]:
#the whole network is declared in one go: three Conv -> MaxPool -> Dropout
#stages with 16, 32 and 64 filters, then a flattened softmax classifier
#with one output per class
model = Sequential([
    Conv2D(filters=16, kernel_size=(3, 3), padding="same", activation="relu", input_shape=INPUT_SHAPE),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Conv2D(32, (3, 3), padding="same", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Conv2D(64, (3, 3), padding="same", activation="relu"),
    MaxPooling2D(pool_size=(2, 2)),
    Dropout(0.25),
    Flatten(),
    Dense(NCLASSES, activation='softmax'),
])

In [None]:
model.summary()

We can now compile the specified model and run it!

In [None]:
# Model compile
model.compile(loss='categorical_crossentropy',
              optimizer=RMSprop(learning_rate=5e-4),
              metrics=['accuracy'])

In [None]:
train_log = model.fit(x=train_generator,
                    validation_data=valid_generator,
                    verbose=2,
                    epochs=EPOCHS
)

In [None]:
plot_loss_history(train_log, 'My model')

## Model evaluation

Let's first get the labels (breeds) in the validation set:

In [None]:
#observed class of each validation image: position of the 1 in its one-hot vector
obs = [np.argmax(vec) for vec in valid_generator.labels]

In [None]:
obs = np.array(obs)

In [None]:
unique, counts = np.unique(obs, return_counts=True)
print(np.asarray((unique, counts)).T)

### Prediction accuracy

#### Accuracy in the training set

In [None]:
# Evaluate on training data
train_accuracy = []

for i in range(10):
  print("iteration",i)
  train_generator.reset()
  scores = model.evaluate(train_generator)
  print("%s%s: %.2f%%" % ("evaluate ",model.metrics_names[1], scores[1]*100))
  train_accuracy.append(scores[1])

In [None]:
print("Average training accuracy")
np.mean(train_accuracy)

#### Accuracy in the validation set

In [None]:
# Evaluate on Validation data
val_accuracy = []

for i in range(1):
  print("iteration",i)
  valid_generator.reset()
  scores = model.evaluate(valid_generator)
  print("%s%s: %.2f%%" % ("evaluate ",model.metrics_names[1], scores[1]*100))
  val_accuracy.append(scores[1])

In [None]:
print("Average validation accuracy")
np.mean(val_accuracy)

### Predictions

In [None]:
valid_generator.reset()
pred = model.predict(valid_generator,
verbose=0)

pred[0:5]

In [None]:
y_pred = []
for i in range(1):
  print("iteration",i)
  valid_generator.reset()
  pred = model.predict(valid_generator,
  verbose=0)
  y_pred.append(np.argmax(pred, axis=1))

In [None]:
y_pred

#### Confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

#one confusion matrix per prediction run, printed and collected as a DataFrame
appended_data = []
for preds in y_pred:
    conf_mat = confusion_matrix(obs, preds)
    print(conf_mat)
    appended_data.append(pd.DataFrame(conf_mat))

In [None]:
cm_conc = pd.concat(appended_data)

final_cm = cm_conc.groupby(cm_conc.index).mean()
print(final_cm)

In [None]:
#confusion_matrix orders rows/columns by the sorted class codes found in the
#observed or predicted labels, whereas index_to_breed is stored in order of
#first appearance: look the names up code by code instead of reversing the
#dictionary values, otherwise the axes can be mislabelled
#(conf_mat was computed from the last element of y_pred)
cm_labels = np.unique(np.concatenate([obs, y_pred[-1]]))
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,
                              display_labels=[index_to_breed.get(int(k), str(k)) for k in cm_labels])

In [None]:
disp.plot()

In [None]:
import seaborn as sn

#figure = plt.figure(figsize=(8, 8))
#sn.heatmap(conf_mat, annot=True,cmap=plt.cm.Blues)
#plt.tight_layout()
#plt.ylabel('True label')
#plt.xlabel('Predicted label')
#plt.show()

### TEST DATA

Now we test the final DNN model on the 29 test images from the `test/` folder:

In [None]:
label_test = pd.read_csv(base_dir + 'labels_test.csv')
print('Test set: {}'.format(label_test.shape))

# Encode the test breeds with the SAME code -> breed mapping that was built
# for the training labels (index_to_breed). Fitting a fresh LabelEncoder on
# the test breeds would assign different codes whenever the set of test
# breeds differs from the training one, silently shifting the classes.
breed_to_index = {breed: index for index, breed in index_to_breed.items()}

known = label_test['breed'].isin(list(breed_to_index))
if not known.all():
    unknown_breeds = label_test.loc[~known, 'breed'].unique().tolist()
    raise ValueError(
        'Breeds in the test set that were not seen in training: {}'.format(unknown_breeds))

label_test['label'] = label_test['breed'].map(breed_to_index).astype(int)

In [None]:
test = label_test['label'].to_numpy().tolist()
test = tf.keras.utils.to_categorical(test)

In [None]:
test = pd.DataFrame(test)
test = test.join(label_test['id'])
test['id'] = test['id'] + '.jpg'
test

In [None]:
colnames = [x for x in test.columns[:-1]]
colnames

In [None]:
test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
test_generator = test_datagen.flow_from_dataframe(
dataframe=test,
directory=base_dir + 'test/',
x_col="id",
y_col=colnames,
#y_col = 'classes',
batch_size=1,
seed=42,
shuffle=False,
class_mode="raw",
target_size=IMAGE_SHAPE)

In [None]:
index_to_breed

In [None]:
#fetch one test image through the iterator protocol: the built-in next()
#does not rely on the generator's `.next()` method, which is not available
#in every Keras version
x, y = next(test_generator)
print(index_to_breed[np.argmax(y[0])])
plt.imshow(x[0])
plt.show()

In [None]:
#restart from the first image so predictions line up with test_generator.labels
test_generator.reset()
#model.predict accepts generators directly (as already done for the
#validation set); predict_generator is deprecated
pred = model.predict(test_generator, verbose=1)
#predicted class = index of the highest softmax probability
y_pred = np.argmax(pred, axis=1)
y_pred

In [None]:
#observed class of each test image: position of the 1 in its one-hot vector
obs = np.array([np.argmax(vec) for vec in test_generator.labels])

In [None]:
df = pd.DataFrame({'obs' : obs, 'preds' : y_pred})
df2 = df.replace({"obs": index_to_breed, "preds" : index_to_breed})
df2

In [None]:
#class codes present in the observations or in the predictions, sorted:
#this is the row/column order of the confusion matrix, made explicit here
cm_labels = np.unique(np.concatenate([obs, y_pred]))
conf_mat = confusion_matrix(obs, y_pred, labels=cm_labels)
#look breed names up code by code: reversing the dictionary values does not
#follow the code order, so the axes could be mislabelled
disp = ConfusionMatrixDisplay(confusion_matrix=conf_mat,
                              display_labels=[index_to_breed.get(int(k), str(k)) for k in cm_labels])
disp.plot()