# Dataset

In [1]:
import glob
import tempfile
import os

import pandas as pd

from monai.data import CacheDataset
from monai.data.utils import partition_dataset_classes

# Prepare data

Process the image paths and labels into an easy-to-handle format so that the data can be split into training and validation sets.

In [18]:
# prepare data function
def image_name(image_file):
    """Return the file name of ``image_file`` up to (not including) its first dot."""
    file_name = os.path.basename(image_file)
    # Everything before the first '.' is the stem; a name without a dot is
    # returned unchanged.
    stem, _, _ = file_name.partition('.')
    return stem



def prepare_data(images_dir, labels_df, format='.jpg', test_size=0.2, seed=42):
    """Build train / validation record lists from an image folder and a label CSV.

    Args:
        images_dir: directory holding the image files, e.g.
            'dataset/siim-isic-melanoma-classification/train'.
        labels_df: location of the label CSV (passed to ``pd.read_csv``), e.g.
            'dataset/siim-isic-melanoma-classification/train.csv'. The file
            must provide an 'image_name' and a 'target' column.
        format: file-name suffix used to select images inside ``images_dir``.
        test_size: fraction of the images assigned to the validation split;
            the remaining ``1 - test_size`` goes to the training split.
        seed: random seed forwarded to ``partition_dataset_classes`` so the
            shuffled split is reproducible and controllable by the caller.

    Returns:
        A ``(train_dicts, val_dicts)`` tuple. Each element is a list of
        ``{'image': <file path>, 'label': <target>}`` dictionaries.

    Raises:
        KeyError: if an image found in ``images_dir`` has no matching
            'image_name' entry in the CSV.
    """
    # Sorted so the input order handed to the partitioner is deterministic.
    image_files = sorted(glob.glob(os.path.join(images_dir, f'*{format}')))

    files_df = pd.read_csv(labels_df)

    # Lookup table: image name (no directory, no extension) -> target label.
    data_dict = dict(
        zip(files_df['image_name'].to_list(), files_df['target'].to_list())
    )

    # Labels aligned one-to-one with image_files, as the partitioner requires.
    labels_list = [data_dict[image_name(path)] for path in image_files]

    train, val = partition_dataset_classes(
        image_files,
        labels_list,
        ratios=[(1 - test_size), test_size],
        shuffle=True,
        seed=seed,  # previously ignored: the split always used the library default seed
    )

    # The partitioner returns only the file paths, so labels are looked up again.
    train_dicts = [{'image': path, 'label': data_dict[image_name(path)]} for path in train]
    val_dicts = [{'image': path, 'label': data_dict[image_name(path)]} for path in val]

    return train_dicts, val_dicts

In [19]:
# Build the training and validation record lists from the image folder and
# the label CSV.
train, val = prepare_data(
    labels_df='dataset/siim-isic-melanoma-classification/train.csv',
    images_dir='dataset/siim-isic-melanoma-classification/jpeg/train',
)

In [12]:
# Inspect the first training record: a dict with an 'image' path and a 'label'.
train[0]

{'image': 'dataset/siim-isic-melanoma-classification/jpeg/train/ISIC_0246090.jpg',
 'label': 0}

In [22]:
from monai.transforms import (
    AsChannelFirstd,
    Compose,
    CenterSpatialCropd,
    LoadImaged,
    EnsureTyped,
    NormalizeIntensityd,
    Resized,
    ToTensord
)

# Preprocessing pipeline applied to the 'image' entry of every record;
# the 'label' entry is left untouched.
train_transforms = Compose([
    LoadImaged(keys='image', image_only=True),          # read the file referenced by the path
    AsChannelFirstd(keys='image'),                      # put the channel axis first
    Resized(keys='image', spatial_size=(256, 256)),     # resize to 256 x 256
    CenterSpatialCropd(keys='image', roi_size=224),     # keep the central 224-sized crop
    NormalizeIntensityd(keys='image'),                  # intensity normalisation with default settings
    ToTensord(keys='image'),                            # convert to a tensor
])

In [23]:
# Recreate the split for the dataset demo.
# NOTE(review): prepare_data returns (train_dicts, val_dicts), so `label_data`
# actually holds the validation records, not labels. The name is kept here in
# case other cells rely on it; consider renaming it to `val_data`.
train_data, label_data = prepare_data(
    labels_df='dataset/siim-isic-melanoma-classification/train.csv',
    images_dir='dataset/siim-isic-melanoma-classification/jpeg/train',
)

# Work on a small subset (first 500 records of each split) to keep this cell fast.
train_data, label_data = train_data[:500], label_data[:500]

# With 500 records and cache_rate=0.1 the recorded output shows 50 items
# being loaded into the cache.
train_ds = CacheDataset(
    data=train_data,
    transform=train_transforms,
    cache_rate=0.1,
    num_workers=8,
)

Loading dataset: 100%|██████████| 50/50 [00:05<00:00,  9.70it/s]


In [21]:
import torch

# Batch the cached training dataset: shuffled mini-batches of two records,
# loaded by 8 worker processes into pinned memory.
train_loader = torch.utils.data.DataLoader(
    train_ds,
    batch_size=2, shuffle=True,
    num_workers=8, pin_memory=True,
)