<a href="https://colab.research.google.com/github/csargin/Dog_breed_identification/blob/main/Dog_breed_identification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Intro

https://www.kaggle.com/competitions/dog-breed-identification

You are provided with a training set and a test set of images of dogs. Each image has a filename that is its unique id. The dataset comprises 120 breeds of dogs. The goal is to create a classifier capable of determining a dog's breed from a photo.

# Import Libraries

In [2]:
import math
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import image as mpimg

import tensorflow as tf
from skimage.transform import resize
from keras.applications import MobileNetV2
from keras.layers import GlobalAveragePooling2D, Dense
from keras.models import Model
from keras.optimizers import Adam
from keras.utils import to_categorical
from keras.preprocessing import image

# import warnings
import warnings
# filter warnings
warnings.filterwarnings('ignore')

import os
from tqdm import tqdm

# Load dataset from Kaggle

In [None]:
#https://www.kaggle.com/discussions/general/74235

! pip install -q kaggle

! mkdir ~/.kaggle
! touch ~/.kaggle/kaggle.json

from google.colab import userdata

# make new JSON file
with open('/content/kaggle.json', 'w') as f:
  text = '{"username":"' + userdata.get("kaggle_username") + '","key":"' + userdata.get('kaggle_psw') + '"}'
  f.write(text)

! chmod 600 ~/.kaggle/kaggle.json
! cp kaggle.json ~/.kaggle/
! kaggle datasets list

In [None]:
# Download the 'dog-breed-identification' competition files with the Kaggle CLI.
# The following cell expects dog-breed-identification.zip in the working directory.
! kaggle competitions download -c 'dog-breed-identification'

In [None]:
! unzip -o dog-breed-identification.zip -d dog-breed-identification # unzip in order to overwrite files

In [5]:
# Folders holding the extracted training and test JPEGs.
train_dir, test_dir = (
    '/content/dog-breed-identification/train',
    '/content/dog-breed-identification/test',
)

# labels.csv provides the 'id' and 'breed' columns used throughout the notebook.
labels = pd.read_csv('/content/dog-breed-identification/labels.csv')

# Training images

In [None]:
# Preview the first five rows of the labels table: each image is shown
# with its breed as the plot title.
print('Train Images:')
for i in range(5):
    sample = labels.iloc[i]

    # Image files are named "<id>.jpg" inside the training folder.
    img_path = os.path.join(train_dir, sample['id'] + '.jpg')
    img = mpimg.imread(img_path)

    plt.imshow(img)
    plt.title(sample['breed'])
    plt.show()

# Test images

In [None]:
# Display the first 5 test images
print('Test Images:')

# List the directory once instead of once per iteration. Sorting makes the
# preview deterministic (os.listdir returns entries in arbitrary order), and
# slicing tolerates a folder with fewer than 5 files.
preview_filenames = sorted(os.listdir(test_dir))[:5]

for filename in preview_filenames:
    # Load and display the image, titled with its filename (test images have no label)
    img_path = os.path.join(test_dir, filename)
    img = mpimg.imread(img_path)
    plt.imshow(img)
    plt.title(filename)
    plt.show()

# Sorting of Breeds

In [None]:
# Alphabetically ordered list of the distinct breed names found in the labels
# table; its length is the number of output classes.
breeds = sorted(labels['breed'].drop_duplicates())
num_classes = len(breeds)

# Breed-to-label mapping

In [9]:
# Lookup table from breed name to its integer class index, i.e. the breed's
# position in the sorted `breeds` list.
breed_to_label = dict(zip(breeds, range(len(breeds))))

# Image Sizing

In [10]:
# img_size: side length (pixels) that every image is resized to.
# batch_size: number of images per batch.
img_size, batch_size = 224, 16

# Preprocessing

In [11]:
def load_and_preprocess_image(img_path, label):
    """Load one training image and one-hot encode its label.

    Args:
        img_path: Path of a JPEG file.
        label: Integer class index of the image.

    Returns:
        A tuple ``(image, one_hot_label)``: the image decoded to 3 channels,
        resized to ``img_size`` x ``img_size`` and divided by 255.0 as float32,
        and the label one-hot encoded over ``num_classes`` classes.
    """
    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.image.decode_jpeg(raw_bytes, channels=3)
    pixels = tf.image.resize(pixels, [img_size, img_size])
    # NOTE(review): test-time preprocessing uses the same /255 scaling; keep
    # the two in sync if this is ever changed.
    pixels = tf.cast(pixels, tf.float32) / 255.0

    return pixels, tf.one_hot(label, num_classes)

In [12]:
def create_dataset(df, train_dir, batch_size):
    """Build a batched, prefetched ``tf.data.Dataset`` from a labels frame.

    Args:
        df: DataFrame with an 'id' column (image id without extension) and a
            'breed' column (breed name present in ``breed_to_label``).
        train_dir: Directory containing the ``<id>.jpg`` files.
        batch_size: Number of examples per batch.

    Returns:
        A dataset of ``(image, one_hot_label)`` batches, in the row order of ``df``.
    """
    autotune = tf.data.experimental.AUTOTUNE

    def to_image_path(image_id):
        return os.path.join(train_dir, f"{image_id}.jpg")

    image_paths = df['id'].apply(to_image_path).to_numpy()
    class_ids = df['breed'].map(breed_to_label).to_numpy()

    pairs = tf.data.Dataset.from_tensor_slices((image_paths, class_ids))
    examples = pairs.map(load_and_preprocess_image, num_parallel_calls=autotune)
    return examples.batch(batch_size).prefetch(buffer_size=autotune)

# MobileNetV2 architecture

In [None]:
# MobileNetV2 backbone initialised with ImageNet weights; include_top=False
# omits its original classification layers so a new head can be attached.
base_model = MobileNetV2(include_top=False, weights='imagenet')

# Classification head (global average pooling + softmax)

In [14]:
# Classification head on top of the backbone: pool the feature maps with
# global average pooling, then map them to one softmax probability per breed.
x = GlobalAveragePooling2D()(base_model.output)
predictions = Dense(units=num_classes, activation='softmax')(x)

# Full model: backbone input -> per-breed probabilities.
model = Model(inputs=base_model.input, outputs=predictions)

# Modelling

In [15]:
# Mark every backbone layer as non-trainable so that only the layers added
# on top of it are updated during training.
for layer in base_model.layers:
    layer.trainable = False

# Adam optimizer with categorical cross-entropy (labels are one-hot encoded);
# accuracy is tracked as a metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

In [None]:
# Number of batches needed to cover every training row once (last batch may be partial).
batches_per_epoch = math.ceil(len(labels) / batch_size)

# Number of passes over the training data.
num_epochs = 10

# Train the model using tf.data.Dataset
for epoch in range(num_epochs):
    print('Epoch', epoch + 1)

    # Shuffle into a NEW frame each epoch. The global `labels` frame is left
    # untouched, so earlier cells stay valid and this cell can be re-run
    # without changing its input.
    shuffled_labels = labels.sample(frac=1).reset_index(drop=True)

    # Load and preprocess the data using tf.data.Dataset
    dataset = create_dataset(shuffled_labels, train_dir, batch_size)

    # The context manager closes the progress bar even if a training step raises.
    with tqdm(total=batches_per_epoch) as pbar:
        for x_batch, y_batch in dataset:
            # return_dict=True yields {metric name: value} for this step.
            batch_metrics = model.train_on_batch(x_batch, y_batch.numpy(), return_dict=True)

            # Show the loss/accuracy reported by Keras next to the bar so that
            # training progress is visible.
            pbar.set_postfix({name: float(value) for name, value in batch_metrics.items()})
            pbar.update(1)

In [None]:
# Sorted so the row order of the output file is reproducible
# (os.listdir returns entries in arbitrary order).
test_filenames = sorted(os.listdir(test_dir))

# Create a function to load and preprocess a test image
def load_and_preprocess_test_image(img_path):
    """Load one test image with the same steps used for training images.

    Args:
        img_path: Path of a JPEG file.

    Returns:
        The image decoded to 3 channels, resized to ``img_size`` x ``img_size``
        and divided by 255.0 as float32.
    """
    img = tf.io.read_file(img_path)
    img = tf.image.decode_jpeg(img, channels=3)
    img = tf.image.resize(img, [img_size, img_size])
    img = tf.cast(img, tf.float32) / 255.0
    return img

# Create a list to store the predictions as (image id, predicted breed) tuples
predictions_list = []

# Guard against an empty folder: there is nothing to predict in that case.
if test_filenames:
    test_paths = [os.path.join(test_dir, filename) for filename in test_filenames]

    # Run inference in batches through a tf.data pipeline instead of calling
    # model.predict once per image, which is far slower for thousands of files.
    test_dataset = (
        tf.data.Dataset.from_tensor_slices(test_paths)
        .map(load_and_preprocess_test_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
        .batch(batch_size)
        .prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    )
    probabilities = model.predict(test_dataset)

    # Most probable class per image; the index is a position in `breeds`.
    predicted_indices = np.argmax(probabilities, axis=1)

    for filename, class_index in zip(test_filenames, predicted_indices):
        # splitext drops only the final extension, so ids containing dots are kept intact.
        image_id = os.path.splitext(filename)[0]
        predictions_list.append((image_id, breeds[class_index]))

# Create a DataFrame from the list of predictions
submission_df = pd.DataFrame(predictions_list, columns=['id', 'breed'])

# NOTE(review): this writes a single predicted breed per image. Confirm
# against the competition's sample_submission.csv whether one probability
# column per breed is required instead - TODO verify before submitting.
# Save the DataFrame to a CSV file
submission_df.to_csv('Dog_breed_Submission.csv', index=False)