# Creating custom Dataset
* 40% of each class (400 for each class)
* train dataset: 75% of class (300)
* test dataset: 25% of class (100)

In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))  # Go up from notebooks/ to project root

In [2]:
import pathlib

data_dir = pathlib.Path("./data/food-101/images")

# To use ImageFolder class using PyTorch

* customdata/
    * train/
        * pizza/
            * image1.png
            * image300.png
        * steak/
            * image1.png
            * image300.png
    * test/
        * pizza/
            * image1.png
            * image300.png
        * steak/
            * image1.png
            * image2.png

In [3]:
def create_custom_dirs(base_dir):
    """
    Creates custom train and test directories under the given base_dir.
    Returns the paths to train and test directories.
    """
    custom_train_loc = base_dir / "train"
    custom_test_loc = base_dir / "test"
    base_dir.mkdir(exist_ok=True)
    
    custom_train_loc.mkdir(parents=True, exist_ok=True)
    custom_test_loc.mkdir(parents=True, exist_ok=True)
    print(f"created new directory: {custom_test_loc}\n")
    print(f"created new directory: {custom_train_loc}")
    return custom_train_loc, custom_test_loc

# custom_data = pathlib.Path("./customdata")
# custom_train_loc, custom_test_loc = create_custom_dirs(custom_data)
    

In [4]:
from helper.data_setup import get_classnames

classes = get_classnames()
classes[:10]

['apple_pie',
 'baby_back_ribs',
 'baklava',
 'beef_carpaccio',
 'beef_tartare',
 'beet_salad',
 'beignets',
 'bibimbap',
 'bread_pudding',
 'breakfast_burrito']

In [5]:
import random
from pathlib import Path
import shutil
from tqdm import tqdm

def create_custom_data(data_loc: Path, train_loc: Path, test_loc: Path, classes: list, size: int = 40, train_test_split: int = 75) -> None:
    """
    This function is used to create a custom data for using in PyTorch's ImageFolder.
    Function will randomly select the size% of data given byt the user.
    It will then split that data into train and test folders based on the train_test_split value given by the user
    """
    for classname in classes:
        #setup the individual class path
        class_path = Path(data_loc/classname)
        #get all the files from a class_path
        files = list(class_path.rglob("*"))
        #randomly select size% of the files
        random_files = random.sample(files, int((size/100)*len(files)))

        #split the random file to train and test
        train_images = random_files[:int(train_test_split/100 * len(random_files))]
        print(f"train images for {classname}: {len(train_images)}")
        test_images = random_files[int(train_test_split/100 * len(random_files)):]
        print(f"test images for {classname}: {len(test_images)}\n")

        #copy the images to respective folders
        for image in tqdm(train_images, desc=f"    Train [{classname}]", unit="img"):
            parent_folder = image.parent.name
            
            #copy it into train_loc/parent_folder
            dest_path = train_loc/parent_folder
            dest_path.mkdir(parents=True, exist_ok=True)
        
            shutil.copy2(image, dest_path)
        print(f"Completed creating {dest_path}\n")
        
        #copy the test images to respective filder
        for image in tqdm(test_images, desc=f"    Test [{classname}]", unit="img"):
            parent_folder = image.parent.name

            dest_path = test_loc/parent_folder
            dest_path.mkdir(parents=True, exist_ok=True)
            shutil.copy2(image, dest_path)
        print(f"Completed creating custom {dest_path}")
        

# create_custom_data(
#                 data_loc=data_dir,
#                 train_loc=custom_train_loc,
#                 test_loc=custom_test_loc,
#                 classes=classes)

In [6]:
files_in_custom_train = list(custom_train_loc.rglob("*/*"))
files_in_custom_test = list(custom_test_loc.rglob("*/*"))

print(f"total no of train images:{len(files_in_custom_train)}")
print(f"total no of test images:{len(files_in_custom_test)}")
print(f"No of train images per class:{len(list((custom_train_loc / classes[0]).rglob('*')))}")
print(f"No of test images per class:{len(list((custom_test_loc / classes[0]).rglob('*')))}")

NameError: name 'custom_train_loc' is not defined

In [5]:
from helper.data_setup import create_custom_dirs, create_custom_data
from pathlib import Path

pss = Path("./pizza_steak_sushi")

create_custom_dirs(pss)

created new directory: pizza_steak_sushi\test

created new directory: pizza_steak_sushi\train


(WindowsPath('pizza_steak_sushi/train'), WindowsPath('pizza_steak_sushi/test'))

In [6]:
create_custom_data(
    data_loc=data_dir,
    train_loc=Path(pss/"train"),
    test_loc=Path(pss/"test"),
    classes=["pizza","steak","sushi"],
    size=20,
    train_test_split=80)

train images for pizza: 160
test images for pizza: 40



    Train [pizza]: 100%|██████████| 160/160 [00:02<00:00, 66.95img/s]


Completed creating pizza_steak_sushi\train\pizza



    Test [pizza]: 100%|██████████| 40/40 [00:00<00:00, 69.04img/s]


Completed creating custom pizza_steak_sushi\test\pizza
train images for steak: 160
test images for steak: 40



    Train [steak]: 100%|██████████| 160/160 [00:02<00:00, 70.19img/s]


Completed creating pizza_steak_sushi\train\steak



    Test [steak]: 100%|██████████| 40/40 [00:00<00:00, 75.79img/s]


Completed creating custom pizza_steak_sushi\test\steak
train images for sushi: 160
test images for sushi: 40



    Train [sushi]: 100%|██████████| 160/160 [00:02<00:00, 69.74img/s]


Completed creating pizza_steak_sushi\train\sushi



    Test [sushi]: 100%|██████████| 40/40 [00:00<00:00, 73.93img/s]

Completed creating custom pizza_steak_sushi\test\sushi





In [7]:
files = list((pss/"train"/"pizza").rglob("*"))
print(f"train length:{len(files)}")
print(f"test length: {len(list((pss/'test'/'pizza').rglob('*')))}")


train length:224
test length: 64
