### Download and Extract the Stanford Dog Breed Dataset from the URL Below

* When connecting the Kaggle dataset via Object Storage, reading images one by one takes a long time, which slows down model training.
* To speed up training, download and extract the images directly to the local disk and use them in the model.


In [None]:
# Download the Stanford Dog Breed dataset as a single tar archive
!wget http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar
# List the current directory, then extract the archive into it (/kaggle/working).
# The -v flag makes tar print every extracted file name.
!ls; tar -xvf images.tar


In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os

# Sanity check of the extraction: walk the Images directory recursively and
# print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/working/Images'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

### Create a DataFrame of Metadata (Image Absolute Path and Label) Based on Image Directories and Filenames

* Under the `/kaggle/working/Images` directory, there are subdirectories for each dog breed containing the image files.
* The label values are generated by extracting the subdirectory name immediately above each image file in its absolute path.


In [None]:
# Worked example of the label extraction on one concrete path.
file_path='/kaggle/working/Images/n02099429-curly-coated_retriever/n02099429_841.jpg'
# Find the first '/' at or after index 20. The prefix '/kaggle/working/Images' is
# 22 characters long, so the search lands on the '/' that follows 'Images'.
start_pos = file_path.find('/', 20)
# Find the last '/' in the path (searching from the right)
end_pos = file_path.rfind('/')
# Extract the substring between start_pos and end_pos, i.e. the breed folder name
imsi_breed = file_path[start_pos+1:end_pos]
# From the folder name, take everything after the first '-' as the actual label value
breed = imsi_breed[imsi_breed.find('-')+1:]
print(start_pos, end_pos, imsi_breed, breed)


In [None]:
import pandas as pd
import numpy as np
import os 

IMAGE_DIR = '/kaggle/working/Images'


def _breed_from_dirname(dirname):
    """Return the breed label encoded in a folder name such as 'n02099429-curly-coated_retriever'."""
    folder = os.path.basename(os.path.normpath(dirname))
    # Only the first '-' separates the ID prefix from the breed; breed names may contain '-'.
    _, separator, breed = folder.partition('-')
    # A folder without '-' is used unchanged as the label.
    return breed if separator else folder


def make_dogbreed_dataframe(image_dir=IMAGE_DIR):
    """Build a metadata DataFrame for every .jpg file found under image_dir.

    The label of an image is derived from the name of the directory that
    directly contains it, so the function works for any root directory and
    not only for the default Kaggle path.

    Args:
        image_dir: Root directory that holds one subdirectory per breed.

    Returns:
        DataFrame with the columns 'path' (file path), 'label' (breed name)
        and 'target' (index of the label in the alphabetically sorted list
        of labels), sorted by target and then path.
    """
    paths = []
    labels = []
    for dirname, _, filenames in os.walk(image_dir):
        breed = _breed_from_dirname(dirname)
        for filename in filenames:
            # Some files in the directory may not be images.
            if not filename.endswith('.jpg'):
                continue
            paths.append(os.path.join(dirname, filename))
            labels.append(breed)

    data_df = pd.DataFrame({'path': paths, 'label': labels})

    # Map each label to a numeric target value
    sorted_label = np.sort(data_df['label'].unique())
    label_mapping = {label: index for index, label in enumerate(sorted_label)}
    data_df['target'] = data_df['label'].map(label_mapping)
    # Sort the DataFrame by target and path for consistency
    data_df = data_df.sort_values(by=['target', 'path'], ascending=True)

    return data_df


In [None]:
import pandas as pd

# Widen the column display so that long file paths are not truncated.
pd.set_option('display.max_colwidth', 200)

# Build the metadata DataFrame from the default image directory and preview it.
data_df = make_dogbreed_dataframe()
print('data_df shape:', data_df.shape)
data_df.head()

### Check the Distribution of Individual Dog Breeds


In [None]:
print(data_df.shape)
# Number of images per (label, target) pair, i.e. per breed
data_df[['label', 'target']].value_counts()

In [None]:

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Wide figure so that one bar per breed fits on the x axis.
plt.figure(figsize=(26, 4))

# Bar chart of the number of images per breed label.
sns.countplot(data=data_df, x='label')
# Rotate the breed names so they do not overlap.
plt.xticks(rotation=90)
plt.show()

### Display Dog Breed Images

In [None]:
import cv2

def show_grid_images(image_path_list, ncols=8, title=None):
    """Show up to ncols images side by side in a single row.

    Args:
        image_path_list: Paths of the images to display. If it holds fewer
            than ncols paths, the remaining cells are left blank.
        ncols: Number of columns in the figure.
        title: Title placed above every displayed image.

    Raises:
        FileNotFoundError: If an image cannot be read by OpenCV.
    """
    # squeeze=False keeps axs two-dimensional, so ncols=1 is indexed like any other width.
    figure, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=ncols, squeeze=False)
    for i, ax in enumerate(axs[0]):
        if i >= len(image_path_list):
            # No image left for this cell: hide the empty axes.
            ax.axis('off')
            continue
        image_bgr = cv2.imread(image_path_list[i])
        if image_bgr is None:
            raise FileNotFoundError(f'Could not read image: {image_path_list[i]}')
        # OpenCV loads images as BGR; matplotlib expects RGB.
        ax.imshow(cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB))
        ax.set_title(title)

In [None]:
# Take the first six image paths of two breeds and show each breed in its own row.
breed_image_list_01 = data_df[data_df['label']=='Staffordshire_bullterrier']['path'].iloc[:6].tolist()
breed_image_list_02 = data_df[data_df['label']=='American_Staffordshire_terrier']['path'].iloc[:6].tolist()

show_grid_images(breed_image_list_01, ncols=6, title='Staffordshire_bullterrier')
show_grid_images(breed_image_list_02, ncols=6, title='American_Staffordshire_terrier')

In [None]:
# Breed names ordered by how many images each breed has (most frequent first).
breed_list = data_df['label'].value_counts().index.tolist()

# Show the first six images of each of the five most frequent breeds.
for breed in breed_list[:5]:
    first_six_paths = data_df.loc[data_df['label'] == breed, 'path'].head(6).tolist()
    show_grid_images(first_six_paths, ncols=6, title=breed)

### Visualize Images with Applied Augmentation


In [None]:
import albumentations as A

# Be cautious when using crop. Since both humans and dogs appear in the dataset,
# if a person is centered, the crop might cut out only the person.
# Pipeline used only for visualization: resize to 224x224, keep the central
# 200x200 region, then apply random flips, shift/scale/rotate and color changes.
imsi_augmentor = A.Compose([
    A.Resize(height=224, width=224, p=1), 
    A.CenterCrop(height=200, width=200, p=1),  # A.CenterCrop(height=180, width=180, p=1)
    A.HorizontalFlip(p=0.5),
    A.VerticalFlip(p=0.5),
    A.ShiftScaleRotate(p=0.5),
    A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=0.5),
    # NOTE(review): the hue/saturation/value limits are 0.2 here; these shifts are
    # presumably far too small to be visible on 8-bit images - confirm the intended scale.
    A.HueSaturationValue(hue_shift_limit=0.2, sat_shift_limit=0.2, val_shift_limit=0.2, p=0.5)
])

# Redefine show_grid_images with an optional Albumentations pipeline so that
# original and augmented versions of the same images can be compared.
def show_grid_images(image_path_list, augmentor=None, ncols=4, title=None):
    """Show up to ncols images in a single row, optionally augmented.

    Args:
        image_path_list: Paths of the images to display. If it holds fewer
            than ncols paths, the remaining cells are left blank.
        augmentor: Optional callable invoked as augmentor(image=...) whose
            result holds the transformed image under the 'image' key.
            The result must still be drawable by imshow.
        ncols: Number of columns in the figure.
        title: Title placed above every displayed image.

    Raises:
        FileNotFoundError: If an image cannot be read by OpenCV.
    """
    # squeeze=False keeps axs two-dimensional, so ncols=1 is indexed like any other width.
    figure, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=ncols, squeeze=False)
    for i, ax in enumerate(axs[0]):
        # Axes are switched off for every cell, including cells without an image.
        ax.axis('off')
        if i >= len(image_path_list):
            continue
        image_bgr = cv2.imread(image_path_list[i])
        if image_bgr is None:
            raise FileNotFoundError(f'Could not read image: {image_path_list[i]}')
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        if augmentor is not None:
            image = augmentor(image=image)['image']
        ax.imshow(image)
        ax.set_title(title)
        
# Show the same six images twice: untouched, then passed through imsi_augmentor.
breed_image_list_01 = data_df[data_df['label']=='Staffordshire_bullterrier']['path'].iloc[:6].tolist()       
show_grid_images(breed_image_list_01, augmentor=None, ncols=6, title='original Staffordshire_bullterrier')
show_grid_images(breed_image_list_01, augmentor=imsi_augmentor, ncols=6, title='augmented Staffordshire_bullterrier')


### Split the Entire DataFrame into Training and Testing DataFrames, Then Further Split the Training DataFrame into Training and Validation Sets

* Use `train_test_split()` to allocate 40% of the total data to the test set.
* Set the `stratify` parameter to ensure an even distribution of breed labels across the splits.


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 40% of all images as the test set, stratified by breed label.
# The fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(data_df, test_size=0.4, stratify=data_df['label'], random_state=2025)
print(train_df.shape, test_df.shape)

In [None]:
# Per-breed share of each split, to check that stratification kept the ratios similar.
print(train_df['label'].value_counts()/train_df.shape[0])
print(test_df['label'].value_counts()/test_df.shape[0])

In [None]:
# Split the 60% training portion again: 20% of it (12% of all images) becomes the
# validation set. Stratification uses the numeric 'target' column this time.
tr_df, val_df = train_test_split(train_df, test_size=0.2, stratify=train_df['target'], random_state=2025)

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import cv2

class BreedDataset(Dataset):
    """Dataset that loads images from file paths, optionally with targets.

    Args:
        image_paths: Sequence of image file paths.
        targets: Optional sequence of numeric targets, one per image path.
            When omitted, items consist of the image only.
        transform: Optional callable invoked as transform(image=...) whose
            result holds the transformed image under the 'image' key.
            When omitted, the float32 RGB array is returned unchanged.

    Raises:
        ValueError: If targets is given and its length differs from image_paths.
    """

    def __init__(self, image_paths, targets=None, transform=None):
        if targets is not None and len(targets) != len(image_paths):
            raise ValueError(
                f'image_paths and targets differ in length: '
                f'{len(image_paths)} != {len(targets)}'
            )
        self.image_paths = image_paths
        self.targets = targets
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        """Return the image at idx, or an (image, target) pair when targets exist.

        Raises:
            FileNotFoundError: If the image cannot be read by OpenCV.
        """
        image_path = self.image_paths[idx]
        image_bgr = cv2.imread(image_path)
        # cv2.imread signals failure by returning None instead of raising.
        if image_bgr is None:
            raise FileNotFoundError(f'Could not read image: {image_path}')
        # OpenCV loads images as BGR; convert to RGB and to float32.
        image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB).astype(np.float32)
        if self.transform is not None:
            image = self.transform(image=image)['image']

        if self.targets is None:
            return image
        return image, torch.tensor(self.targets[idx])

In [None]:
import albumentations as A
from albumentations.pytorch import ToTensorV2

# The batch size is declared twice: as a module-level constant and as a CFG attribute.
BATCH_SIZE = 16
class CFG:
    # Shared notebook settings; later cells overwrite these attributes in place.
    batch_size = 16
    image_size = 224

# Data Augmentation
# Training pipeline: resize to a square of CFG.image_size, random horizontal flip,
# normalize with the given per-channel mean/std, then convert to a tensor.
tr_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size, p=1),
    A.HorizontalFlip(p=0.5),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

# Validation pipeline: the same steps without the random flip.
val_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

def create_tr_val_loader(tr_df, val_df, tr_transform, val_transform, batch_size=None):
    """Create the training and validation DataLoaders.

    Args:
        tr_df: Training DataFrame with 'path' and 'target' columns.
        val_df: Validation DataFrame with 'path' and 'target' columns.
        tr_transform: Transform applied to training images.
        val_transform: Transform applied to validation images.
        batch_size: Training batch size. Defaults to CFG.batch_size read at
            call time, so that later changes to CFG.batch_size take effect.

    Returns:
        Tuple (tr_loader, val_loader). The training loader shuffles; the
        validation loader does not and uses four times the training batch size.
    """
    if batch_size is None:
        batch_size = CFG.batch_size

    tr_dataset = BreedDataset(image_paths=tr_df['path'].to_list(),
                              targets=tr_df['target'].to_list(), transform=tr_transform)
    val_dataset = BreedDataset(image_paths=val_df['path'].to_list(),
                               targets=val_df['target'].to_list(), transform=val_transform)

    tr_loader = DataLoader(tr_dataset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=4 * batch_size, shuffle=False, num_workers=4, pin_memory=True)

    return tr_loader, val_loader

# Build the loaders and pull one training batch to inspect the tensor shapes.
tr_loader, val_loader = create_tr_val_loader(tr_df=tr_df, val_df=val_df, 
                                             tr_transform=tr_transform, val_transform=val_transform)
images, labels = next(iter(tr_loader))
print(images.shape, labels.shape)


### Create torchvision Models

* Implement a model creation function that allows testing with both ResNet101 and the EfficientNet family.


In [None]:
import torch
import torch.nn as nn
from torchvision import models

def create_tv_model(model_name, num_classes=1000):
    """Create a torchvision model with default weights and a new output layer.

    Args:
        model_name: One of 'efficientnet_v2_s', 'efficientnet_b4',
            'efficientnet_b1', 'efficientnet_b0' or 'resnet101'.
        num_classes: Number of outputs of the replaced final layer.

    Returns:
        The model with its classification head replaced.

    Raises:
        ValueError: If model_name is not one of the supported names.
    """
    efficientnet_names = ('efficientnet_v2_s', 'efficientnet_b4',
                          'efficientnet_b1', 'efficientnet_b0')
    if model_name != 'resnet101' and model_name not in efficientnet_names:
        # Fail here rather than returning None and crashing later in the caller.
        raise ValueError(
            f"Unsupported model_name {model_name!r}; expected 'resnet101' or one of {efficientnet_names}"
        )

    # The torchvision builder function has the same name as the model.
    model = getattr(models, model_name)(weights='DEFAULT')
    if model_name == 'resnet101':
        model.fc = nn.Linear(in_features=model.fc.in_features, out_features=num_classes)
    else:
        # Read the feature width from the existing head instead of hard-coding it per model.
        in_features = model.classifier[-1].in_features
        model.classifier = nn.Sequential(nn.Dropout(p=0.2),
                                         nn.Linear(in_features=in_features, out_features=num_classes))

    return model

# Example: EfficientNet-B0 with a 120-output head.
eff_model = create_tv_model('efficientnet_b0', num_classes=120)

In [None]:
from torchvision import models

# Display the layer structure (no weights are loaded). Swap in another builder
# to inspect it instead: resnet101, efficientnet_b0, efficientnet_v2_s
models.resnet101(weights=None)#resnet101, efficientnet_b0, efficientnet_v2_s

In [None]:
models.ResNet101_Weights.IMAGENET1K_V2.transforms() # preprocessing bundled with the ResNet101 weights

### Train Models with the Trainer

* ResNet101 and EfficientNet B0/B1


In [None]:
# Recreate ./modular/v1 from scratch and download the helper module into it.
# NOTE(review): utils.py is fetched from the 'main' branch at run time, so its
# contents are not pinned to a specific revision.
!rm -rf ./modular/v1
!mkdir -p ./modular/v1
!wget -O ./modular/v1/utils.py https://raw.githubusercontent.com/gayoung-k/cnn-learning-notes/main/utils.py
!ls ./modular/v1

import sys

# Make the downloaded 'modular' directory importable as a package.
sys.path.append('/kaggle/working')

from modular.v1.utils import Trainer, ModelCheckpoint, EarlyStopping

In [None]:
from torch.optim import Adam
import torch.optim as optim
import torchmetrics
import timm

# Settings for the 224x224 training runs.
CFG.batch_size = 32 # 16
CFG.image_size = 224

# Training pipeline with Horizontal_flip as the only random augmentation
tr_transform_01 = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size, p=1),
    A.HorizontalFlip(p=0.5),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

# Validation pipeline: resize and normalize only
val_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

def train_breed(model_name, tr_transform, val_transform, learning_rate=1e-3, callbacks=None, epochs=30):
    """Train a 120-class model on the notebook's tr_df / val_df split.

    Args:
        model_name: Name passed to create_tv_model.
        tr_transform: Transform applied to training images.
        val_transform: Transform applied to validation images.
        learning_rate: Initial learning rate of the Adam optimizer.
        callbacks: Passed through to the Trainer unchanged.
        epochs: Passed to Trainer.fit.

    Returns:
        Tuple (trainer, history), where history is whatever Trainer.fit returns.
    """
    loaders = create_tr_val_loader(tr_df=tr_df, val_df=val_df,
                                   tr_transform=tr_transform, val_transform=val_transform)
    train_dl, valid_dl = loaders

    net = create_tv_model(model_name, num_classes=120)
    if torch.cuda.is_available():
        run_device = torch.device('cuda')
    else:
        run_device = torch.device('cpu')

    adam = Adam(net.parameters(), lr=learning_rate)
    criterion = nn.CrossEntropyLoss()
    # Multiply the learning rate by 0.2 when the monitored value stops decreasing.
    plateau_options = dict(mode='min', factor=0.2, patience=3, threshold=0.01, min_lr=1e-7)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=adam, **plateau_options)

    trainer = Trainer(model=net, loss_fn=criterion, optimizer=adam,
                      train_loader=train_dl, val_loader=valid_dl, scheduler=lr_scheduler,
                      callbacks=callbacks, device=run_device)
    return trainer, trainer.fit(epochs)

In [None]:
# Model: ResNet101, apply only HorizontalFlip as augmentation.
# Learning rate: 1e-4, callbacks: None, epochs: 10.
trainer, history = train_breed('resnet101', tr_transform_01, val_transform, learning_rate=1e-4, 
                               callbacks=None, epochs=10)


In [None]:
from modular.v1.utils import Predictor

# Paths and targets of the held-out test split.
test_image_paths = test_df['path'].to_list()
test_targets = test_df['target'].to_list()

#CFG.batch_size = 16
CFG.image_size = 224

# Test-time pipeline: resize and normalize only, no random augmentation.
test_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

test_dataset = BreedDataset(image_paths=test_image_paths, 
                            targets=test_targets, transform=test_transform)
# shuffle=False keeps the batches in the row order of test_df.
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

# Evaluate the model held by the trainer on the test loader.
trained_model = trainer.get_trained_model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

In [None]:
import os
from PIL import Image
from torchvision import transforms
import cv2
import albumentations as A
from albumentations.pytorch import ToTensorV2

# Directory and file names of extra test images from a separate Kaggle input dataset.
img_dir="/kaggle/input/test-dog-data"
img_names = ["test_dog.jpg", "test_dog2.jpg"]

In [None]:
def predict_single_image(image_path, predictor, transform, class_names):
    """Predict the class name of a single image file.

    Args:
        image_path: Path of an image readable by OpenCV.
        predictor: Object that exposes the trained network as predictor.model.
        transform: Callable invoked as transform(image=...) whose result holds
            the model-ready image tensor under the 'image' key.
        class_names: Sequence that maps a class index to its name.

    Returns:
        The entry of class_names at the predicted class index.

    Raises:
        FileNotFoundError: If the image cannot be read by OpenCV.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising.
    if image is None:
        raise FileNotFoundError(f'Could not read image: {image_path}')
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    transformed = transform(image=image)
    model = predictor.model
    # Send the input to the device that holds the model weights.
    device = next(model.parameters()).device
    # unsqueeze(0) adds the batch dimension expected by the model.
    input_tensor = transformed['image'].unsqueeze(0).to(device)

    model.eval()
    with torch.no_grad():
        output = model(input_tensor)
        pred_class = output.argmax(dim=1).item()
    return class_names[pred_class]

#### Check Which Breeds Were Most Frequently Misclassified

* Compare the actual ground truth breeds with the predicted breeds to see how they were misclassified.


In [None]:
# Perform predictions on all test data and return results as a NumPy array
def get_all_predictions(predictor, test_loader):
    """Run predictor.predict on every batch and collect the results.

    Args:
        predictor: Object with a predict(images) method that returns a tensor.
        test_loader: Iterable that yields (images, targets) pairs; the
            targets are ignored.

    Returns:
        NumPy array holding the per-sample predictions of all batches in
        loader order.
    """
    # One array per batch, e.g. images of shape (64, 3, 224, 224) -> (64,)
    batch_preds = [predictor.predict(images).cpu().numpy() for images, _ in test_loader]
    # Flatten batch by batch so that every element is a single sample's prediction.
    return np.array([pred for batch in batch_preds for pred in batch])

preds_all = get_all_predictions(predictor, test_loader)

# Save the prediction results into a separate column of test_df.
# This relies on test_loader being unshuffled and built from test_df's own
# path list, so prediction i belongs to row i.
test_df['resnet101_pred'] = preds_all


In [None]:
# Rows whose predicted class differs from the ground-truth target
test_df[test_df['target'] != test_df['resnet101_pred']]

In [None]:
# Number of misclassified test images per true breed
test_df[test_df['target'] != test_df['resnet101_pred']]['label'].value_counts()

In [None]:
# For Eskimo_dog images only: how often each class was predicted
test_df[test_df['label'] == 'Eskimo_dog'][['target', 'resnet101_pred']].value_counts()

In [None]:
def show_grid_images(image_path_list, augmentor=None, ncols=4, title=None):
    """Show up to ncols images in a single row, each resized to 224x224.

    Args:
        image_path_list: Paths of the images to display. If it holds fewer
            than ncols paths, the remaining cells are left blank.
        augmentor: Optional callable invoked as augmentor(image=...) on the
            resized image; its result holds the transformed image under the
            'image' key and must still be drawable by imshow.
        ncols: Number of columns in the figure.
        title: Title placed above every displayed image.

    Raises:
        FileNotFoundError: If an image cannot be read by OpenCV.
    """
    # squeeze=False keeps axs two-dimensional, so ncols=1 is indexed like any other width.
    figure, axs = plt.subplots(figsize=(22, 4), nrows=1, ncols=ncols, squeeze=False)
    for i, ax in enumerate(axs[0]):
        # Axes are switched off for every cell, including cells without an image.
        ax.axis('off')
        if i >= len(image_path_list):
            continue
        image_bgr = cv2.imread(image_path_list[i])
        if image_bgr is None:
            raise FileNotFoundError(f'Could not read image: {image_path_list[i]}')
        # OpenCV loads images as BGR; matplotlib expects RGB.
        image = cv2.cvtColor(image_bgr, cv2.COLOR_BGR2RGB)
        # A common size makes images of different breeds directly comparable.
        image = cv2.resize(image, (224, 224))
        if augmentor is not None:
            image = augmentor(image=image)['image']
        ax.imshow(image)
        ax.set_title(title)
        
# Show six images each of Siberian_husky and Eskimo_dog for a visual comparison.
breed_image_list_01 = data_df[data_df['label']=='Siberian_husky']['path'].iloc[:6].tolist()
breed_image_list_02 = data_df[data_df['label']=='Eskimo_dog']['path'].iloc[:6].tolist()

show_grid_images(breed_image_list_01, ncols=6, title='Siberian_husky')
show_grid_images(breed_image_list_02, ncols=6, title='Eskimo_dog')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# Keep only the misclassified test rows.
wrong_result_df = test_df[test_df['target'] != test_df['resnet101_pred']]

# Bar chart of the number of misclassified images per true breed.
plt.figure(figsize=(26, 4))
plt.xticks(rotation=90)
sns.countplot(data=wrong_result_df, x='label')

#### Train and Evaluate with EfficientNet-B0

* Train and evaluate the EfficientNet-B0 model using the same batch size, image size, and augmentation as the ResNet101 run.


In [None]:
models.EfficientNet_B0_Weights.IMAGENET1K_V1.transforms() # preprocessing bundled with the EfficientNet-B0 weights

In [None]:
from torch.optim import Adam
import torch.optim as optim
import torchmetrics
import timm

# Train the EfficientNet-B0 model with the same transforms (tr_transform_01 / val_transform)
# and learning rate as the ResNet101 run, for 20 epochs.
trainer, history = train_breed('efficientnet_b0', tr_transform_01, val_transform, learning_rate=1e-4, 
                               callbacks=None, epochs=20)

In [None]:
# Evaluate the newly trained model; test_loader is reused from the earlier cell.
trained_model = trainer.get_trained_model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

#### Train and Evaluate with EfficientNet-B1

* Set the image size to 240


In [None]:
models.EfficientNet_B1_Weights.IMAGENET1K_V2.transforms() # preprocessing bundled with the EfficientNet-B1 weights

In [None]:
from torch.optim import Adam
import torch.optim as optim
import torchmetrics
import timm

# Settings for the EfficientNet-B1 run: the image size is raised to 240.
CFG.batch_size = 32 # 16
CFG.image_size = 240

# Horizontal_flip only; the transforms are rebuilt so they pick up the new image size.
tr_transform_01 = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size, p=1),
    A.HorizontalFlip(p=0.5),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

val_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

trainer, history = train_breed('efficientnet_b1', tr_transform_01, val_transform, learning_rate=1e-4, 
                               callbacks=None, epochs=30)

In [None]:
from modular.v1.utils import Predictor

CFG.image_size = 240

# Rebuild the test pipeline and loader at 240x240 to match the training image size.
test_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

test_dataset = BreedDataset(image_paths=test_image_paths, 
                            targets=test_targets, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

# Evaluate the model held by the trainer on the rebuilt test loader.
trained_model = trainer.get_trained_model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

### Train and Evaluate the Model with Various Augmentations

* Applying heavy augmentations does not necessarily improve performance (it may even degrade it).


In [None]:
from torch.optim import Adam
import torch.optim as optim

CFG.batch_size = 32 # 16
CFG.image_size = 240

# Heavier augmentation pipeline (not just a horizontal flip)
from torch.optim import Adam
import torch.optim as optim
import torchmetrics
import timm

CFG.batch_size = 32 # 16
CFG.image_size = 240

# Each random step has its own probability p. The final Resize brings every
# image to CFG.image_size regardless of whether the 180x180 crop was applied.
tr_transform_02 = A.Compose([
    A.RandomResizedCrop(height=180, width=180, scale=(0.5, 1.0), p=0.3),
    A.HorizontalFlip(p=0.3),
    A.VerticalFlip(p=0.3),
    A.ShiftScaleRotate(p=0.2),
    A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=0.2),
    A.ColorJitter(p=0.2),
    # With probability 0.3, apply exactly one of CoarseDropout or CLAHE.
    A.OneOf(
        [A.CoarseDropout(p=1, max_holes=26), 
         A.CLAHE(p=1)
        ], p=0.3), 
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

# Validation pipeline: resize and normalize only
val_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

In [None]:
breed_image_list_01 = data_df[data_df['label']=='Staffordshire_bullterrier']['path'].iloc[:6].tolist()
# tr_transform_02 ends with Normalize and ToTensorV2, whose output (a normalized
# channel-first tensor) cannot be drawn by imshow. Preview with every step except
# those last two.
preview_augmentor_02 = A.Compose(list(tr_transform_02.transforms)[:-2])
show_grid_images(breed_image_list_01, augmentor=None, ncols=6, title='original Staffordshire_bullterrier')
show_grid_images(breed_image_list_01, augmentor=preview_augmentor_02, ncols=6, title='augmented')

In [None]:
# EfficientNet-B1 again, this time trained with the heavier tr_transform_02 pipeline.
trainer, history = train_breed('efficientnet_b1', tr_transform_02, val_transform, learning_rate=1e-4, epochs=20)

In [None]:
from modular.v1.utils import Predictor

# Paths and targets of the held-out test split.
test_image_paths = test_df['path'].to_list()
test_targets = test_df['target'].to_list()

CFG.image_size = 240

# Test-time pipeline at 240x240: resize and normalize only.
test_transform = A.Compose([
    A.Resize(CFG.image_size, CFG.image_size),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

test_dataset = BreedDataset(image_paths=test_image_paths, 
                            targets=test_targets, transform=test_transform)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4, pin_memory=True)

# Evaluate the model held by the trainer on the test loader.
trained_model = trainer.get_trained_model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')

#### Train and Evaluate the Model After Applying Different Augmentations


In [None]:
# Lighter alternative to tr_transform_02: no crop, vertical flip, dropout or CLAHE.
tr_transform_03 = A.Compose([
    A.HorizontalFlip(p=0.3),
    # NOTE(review): Albumentations presumably adds 1 to scale_limit, which would make
    # (0.7, 0.9) a 1.7x-1.9x zoom-in rather than a shrink to 70-90% - confirm the intent.
    A.ShiftScaleRotate(scale_limit=(0.7, 0.9), p=0.2, rotate_limit=30),
    A.RandomBrightnessContrast(brightness_limit=(-0.2, 0.2), contrast_limit=(-0.2, 0.2), p=0.2),
    A.ColorJitter(p=0.2),
    A.Resize(CFG.image_size, CFG.image_size, p=1),
    A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
    ToTensorV2()
])

In [None]:
breed_image_list_01 = data_df[data_df['label']=='Staffordshire_bullterrier']['path'].iloc[:6].tolist()
# tr_transform_03 ends with Normalize and ToTensorV2, whose output (a normalized
# channel-first tensor) cannot be drawn by imshow. Preview with every step except
# those last two.
preview_augmentor_03 = A.Compose(list(tr_transform_03.transforms)[:-2])
show_grid_images(breed_image_list_01, augmentor=None, ncols=6, title='original Staffordshire_bullterrier')
show_grid_images(breed_image_list_01, augmentor=preview_augmentor_03, ncols=6, title='augmented')

In [None]:
# EfficientNet-B1 trained with the lighter tr_transform_03 pipeline.
trainer, history = train_breed('efficientnet_b1', tr_transform_03, val_transform, learning_rate=1e-4, epochs=20)

In [None]:
# Evaluate the newly trained model; test_loader is reused from the previous evaluation cell.
trained_model = trainer.get_trained_model()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
predictor = Predictor(model=trained_model, device=device)
eval_metric = predictor.evaluate(test_loader)
print(f'test dataset evaluation:{eval_metric:.4f}')