### installs and imports

In [1]:
pip install torch torchvision numpy matplotlib Pathlib -qqq


In [3]:
import os
import tarfile
import matplotlib.pyplot as plt
import numpy as np
import requests
import scipy
from PIL import Image
from torch.utils.data import Dataset, Subset, random_split, DataLoader
from torchvision import transforms
from tqdm.auto import tqdm
from pathlib import Path



### Load Dataset from Kaggle

In [4]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("olgabelitskaya/horse-breeds")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'horse-breeds' dataset.
Path to dataset files: /kaggle/input/horse-breeds


### Split the data into train val, and test set (starified)

In [5]:
#path to dataset
dataset_path = Path(path)

In [20]:
image_paths = []
labels = []

for file_name in os.listdir(dataset_path):
    if file_name.endswith(".png"):
        image_paths.append(Path(dataset_path) / file_name)
        labels.append(int(file_name[1]))

print(image_paths[:5])
print(labels[:5])

[PosixPath('/kaggle/input/horse-breeds/01_103.png'), PosixPath('/kaggle/input/horse-breeds/03_068.png'), PosixPath('/kaggle/input/horse-breeds/02_060.png'), PosixPath('/kaggle/input/horse-breeds/03_030.png'), PosixPath('/kaggle/input/horse-breeds/07_065.png')]
[1, 3, 2, 3, 7]


In [None]:
from sklearn.model_selection import train_test_split
train_paths, val_paths, train_labels, val_labels = train_test_split(
    image_paths, labels, test_size=0.2, stratify=labels, random_state=42
)
print(f"training samples: {len(train_paths)}")
print(f"validation samples: {len(val_paths)}")
print(train_labels[:10])
print(val_labels[:10])


training samples: 536
validation samples: 134
[2, 3, 2, 5, 5, 2, 1, 2, 5, 7]
[7, 2, 2, 6, 6, 1, 1, 5, 6, 6]


### Dataset Class

In [26]:
class HorseBreedDataset(Dataset):
    def __init__(self, image_paths, labels, transform=None):
        self.image_paths = image_paths
        self.labels = labels
        self.transform = transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img_path = self.image_paths[idx]
        label = self.labels[idx]
        image = Image.open(img_path).convert("RGB")  # ensure 3 channels

        if self.transform:
            image = self.transform(image)

        return image, label


### Transforms


In [27]:
# Define the mean values for normalization.
mean = [0.485, 0.456, 0.406]
# Define the standard deviation values for normalization.
std = [0.229, 0.224, 0.225]

In [28]:
transform = transforms.Compose([
    # images transforms
    transforms.Resize((256, 256)),  # Resize images to 256x256 pixels
    transforms.CenterCrop(224),  # Center crop to 224x224 pixels
    # bridge to tensor
    transforms.ToTensor(),  # Convert images to PyTorch tensors
    # tensor transforms
    transforms.Normalize(mean=mean, std=std),
])

In [31]:
# Create a new dataset instance with the specified image transformations.
dataset_transformed = HorseBreedDataset(train_paths, train_labels, transform=transform)

In [33]:
import os
from pathlib import Path
import sys


NOTEBOOK_NAME = "C1_M3_Lab_data_management.ipynb" #TODO: automate this
REPO_URL = "https://github.com/SDAIA-Bootcamps/ai-pros-v1-2025.git"

if 'google.colab' in sys.modules:
    print('in colab')
    if os.system('git config --get remote.origin.url') != REPO_URL:
        print('cloning...')
        !git clone $REPO_URL
        repo_dir = REPO_URL.split('/').pop().strip('.git')
        print(f'{repo_dir=}')
        labdir = next(Path(repo_dir).rglob(NOTEBOOK_NAME)).parent.absolute()
        print(f'{labdir=}')
        %cd $labdir

    # path management for Python
    pythonpath, = !echo $PYTHONPATH
    if "." not in pythonpath.split(":"):
        pythonpath = ".:" + pythonpath
        %env PYTHONPATH={pythonpath}
        !echo $PYTHONPATH

# allow "hot-reloading" of modules
# %load_ext autoreload
# %autoreload 2
# needed for inline plots in some contexts
%matplotlib inline


!pwd
%ls

env: PYTHONPATH=.:/env/python
.:/env/python
/content/ai-pros-v1-2025/W4_DL/C1_M3_Data_Management/C1_M3_Lab_data_management
C1_M3_Lab_data_management.ipynb  [0m[01;34mcorrupted_flower_data[0m/  helper_utils.py


In [36]:
pip install directory-tree

Collecting directory-tree
  Downloading directory_tree-1.0.0-py3-none-any.whl.metadata (9.0 kB)
Downloading directory_tree-1.0.0-py3-none-any.whl (23 kB)
Installing collected packages: directory-tree
Successfully installed directory-tree-1.0.0


In [None]:
import helper_utils
# Define an index for a sample to retrieve.
sel_idx = 10

# Retrieve the image and label for the selected index.
img, label = dataset_transformed[sel_idx]
# Retrieve the transformed image and its label using the same index.
img_transformed, label = dataset_transformed[sel_idx]

# quick check
helper_utils.quick_debug(img_transformed)

# Plot the transformed image
helper_utils.plot_img(img_transformed, label=label)

In [None]:
import helper_utils
# Define an index for a sample to retrieve.
sel_idx = 10

# Retrieve the image and label for the selected index.
img, label = path[sel_idx]
# Retrieve the transformed image and its label using the same index.

# quick check
helper_utils.quick_debug(img_transformed)

# Plot the transformed image
helper_utils.plot_img(img, label=label)

TypeError: 'PosixPath' object is not subscriptable

### Create Dataloader objects

#### Display some images

### Define Model 

### define Loss and Optimizer

#### Build one_epoch_training function loop 

### Combine all to train the model
it should Save the best model and track train and val loss and accuracy


### test the model on test set

### show some predictions with the images

### Analyze the results
Is the model overfitting/underfitting?
Plot the training and validation loss/accuracy curves

### Load the model