In [4]:
import torch
import torchvision

#assert int(torch.__version__.split(".")[1]) >= 11
#assert int(torchvision.__version__.split(".")[1]) >= 11

import torchvision.datasets as datasets
import torchvision.transforms as transforms
from src.config import PROCESSED_DATA_DIR, RAW_DATA_DIR

# Resolve the shared data folder (two levels above this notebook) to an absolute path
import pathlib
data_dir = pathlib.Path("..", "..", "data").resolve()

# Show the notebook-relative data folder next to the project's configured processed folder
print(data_dir, PROCESSED_DATA_DIR, sep="\n")

[32m2024-06-21 16:02:23.302[0m | [1mINFO    [0m | [36msrc.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: C:\Git\hamburger-hotdog-pizza-classifier[0m


C:\Git\hamburger-hotdog-pizza-classifier\data
C:\Git\hamburger-hotdog-pizza-classifier\data\processed


In [3]:
# Arguments shared by both splits: same root folder, download the archive if it is missing.
# No transform is passed here (the ToTensor transform is intentionally left out).
food101_kwargs = dict(root=data_dir, download=True)

# Get training data
train_data = datasets.Food101(split="train", **food101_kwargs)

# Get testing data
test_data = datasets.Food101(split="test", **food101_kwargs)

Downloading https://data.vision.ee.ethz.ch/cvl/food-101.tar.gz to ..\..\data\food-101.tar.gz


100.0%


Extracting ..\..\data\food-101.tar.gz to ..\..\data


In [12]:
# Configuration for sampling a three-class subset of Food-101
import random

# Classes to keep and the folder holding the extracted Food-101 images
target_classes = ["pizza", "hot_dog", "hamburger"]
data_path = data_dir.joinpath("food-101", "images")

# Fraction of the matching images to keep (e.g. 0.1 = random 10%, 0.2 = random 20%, 1 = all)
amount_to_get = 1
# Create function to separate a random amount of data
def get_subset(image_path=data_path,
               data_splits=["train", "test"],
               target_classes=["pizza", "hot_dog", "hamburger"],
               amount=0.1,
               seed=42):
    """Pick a reproducible random subset of image paths for the chosen classes.

    For every split name, the label file ``<data_dir>/food-101/meta/<split>.txt``
    is read, entries whose class (the text before the first ``/``) is in
    ``target_classes`` are kept, and ``round(amount * n_matching)`` of them are
    drawn at random without replacement.

    Args:
        image_path: ``pathlib.Path`` of the images root; each sampled label
            ``<class>/<id>`` becomes ``image_path / "<class>/<id>.jpg"``.
        data_splits: Split names; each must have a matching ``<split>.txt``
            label file.
        target_classes: Class names to keep.
        amount: Fraction of the matching labels to sample per split.
        seed: Seed passed to ``random.seed`` before sampling.

    Returns:
        dict mapping each split name to a list of ``pathlib.Path`` objects.

    Raises:
        ValueError: If ``amount`` yields a sample size that is negative or
            larger than the number of matching labels.
    """
    # Honour the caller's seed (the value used to be hard-coded, so `seed` was ignored).
    random.seed(seed)
    label_splits = {}

    # NOTE(review): label files are located through the notebook-level `data_dir`,
    # not through `image_path` - the cell defining `data_dir` must have been run.
    meta_dir = data_dir / "food-101" / "meta"

    for data_split in data_splits:
        print(f"[INFO] Creating image split for: {data_split}...")
        label_path = meta_dir / f"{data_split}.txt"

        # Stream the file; strip() also drops a stray "\r" or trailing spaces so the
        # image paths built below never carry trailing whitespace.
        with open(label_path, "r") as f:
            labels = [line.strip() for line in f if line.split("/")[0] in target_classes]

        # Get random subset of target classes image ID's
        number_to_sample = round(amount * len(labels))
        if not 0 <= number_to_sample <= len(labels):
            # Same exception type random.sample would raise, with a clearer message.
            raise ValueError(
                f"amount={amount} asks for {number_to_sample} images but split "
                f"'{data_split}' only has {len(labels)} matching labels"
            )
        print(f"[INFO] Getting random subset of {number_to_sample} images for {data_split}...")
        sampled_images = random.sample(labels, k=number_to_sample)

        # Apply full paths
        label_splits[data_split] = [image_path / f"{sample_image}.jpg"
                                    for sample_image in sampled_images]
    return label_splits
        
# Pass the notebook-level settings explicitly: previously `target_classes` and
# `data_path` were defined but never handed to get_subset, so editing them had
# no effect on what was sampled.
label_splits = get_subset(image_path=data_path,
                          target_classes=target_classes,
                          amount=amount_to_get)

# Peek at the first few sampled training paths
label_splits["train"][:10]

[INFO] Creating image split for: train...
[INFO] Getting random subset of 2250 images for train...
[INFO] Creating image split for: test...
[INFO] Getting random subset of 750 images for test...


[WindowsPath('../../data/food-101/images/hamburger/3289634.jpg'),
 WindowsPath('../../data/food-101/images/hamburger/1492254.jpg'),
 WindowsPath('../../data/food-101/images/hot_dog/2848330.jpg'),
 WindowsPath('../../data/food-101/images/hot_dog/2140244.jpg'),
 WindowsPath('../../data/food-101/images/hot_dog/1727984.jpg'),
 WindowsPath('../../data/food-101/images/hamburger/3878886.jpg'),
 WindowsPath('../../data/food-101/images/hamburger/3101158.jpg'),
 WindowsPath('../../data/food-101/images/pizza/937915.jpg'),
 WindowsPath('../../data/food-101/images/hamburger/2726558.jpg'),
 WindowsPath('../../data/food-101/images/pizza/2148129.jpg')]

In [13]:
# Create target directory path
# round() instead of int(): int() truncates, and binary float error would mislabel
# the folder (e.g. 0.29 * 100 == 28.999999999999996 -> "28" instead of "29").
percent_to_get = round(amount_to_get * 100)
target_dir_name = f"../../data/raw/pizza_hamburger_hotdog_{percent_to_get}_percent"
print(f"Creating directory: '{target_dir_name}'")

# Setup the directories
target_dir = pathlib.Path(target_dir_name)

# Make the directories (no error if they already exist, so the cell can be re-run)
target_dir.mkdir(parents=True, exist_ok=True)

Creating directory: '../../data/raw/pizza_hamburger_hotdog_100_percent'


In [14]:
import shutil

# Mirror every sampled image into <target_dir>/<split>/<class folder>/<file name>
for image_split, split_image_paths in label_splits.items():
    for image_path in split_image_paths:
        class_folder = image_path.parent.stem
        dest_dir = target_dir.joinpath(image_split, class_folder, image_path.name)

        # Create the class folder the first time one of its images is seen
        class_dir = dest_dir.parent
        if not class_dir.is_dir():
            class_dir.mkdir(parents=True, exist_ok=True)

        print(f"[INFO] Copying {image_path} to {dest_dir}...")
        # copy2 copies the file contents and also tries to preserve file metadata
        shutil.copy2(image_path, dest_dir)

[INFO] Copying ..\..\data\food-101\images\hamburger\3289634.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\hamburger\3289634.jpg...
[INFO] Copying ..\..\data\food-101\images\hamburger\1492254.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\hamburger\1492254.jpg...
[INFO] Copying ..\..\data\food-101\images\hot_dog\2848330.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\hot_dog\2848330.jpg...
[INFO] Copying ..\..\data\food-101\images\hot_dog\2140244.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\hot_dog\2140244.jpg...
[INFO] Copying ..\..\data\food-101\images\hot_dog\1727984.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\hot_dog\1727984.jpg...
[INFO] Copying ..\..\data\food-101\images\hamburger\3878886.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\hamburger\3878886.jpg...
[INFO] Copying ..\..\data\food-101\images\hamburger\3101158.jpg to ..\..\data\raw\pizza_hamburger_hotdog_100_percent\train\h