## Check and Prepare data

### Check file counts

In [13]:
import os

def count_files_in_folder(folder_path):
    """Count the entries directly inside ``folder_path``.

    Every name returned by ``os.listdir`` is counted, so sub-directories
    are included in the total alongside regular files.

    Args:
        folder_path: Path of the folder to inspect.

    Returns:
        The number of entries as an ``int``, or the string
        ``"Folder not found!"`` when the folder does not exist.
    """
    try:
        entries = os.listdir(folder_path)
    except FileNotFoundError:
        return "Folder not found!"
    return len(entries)


In [14]:
from itertools import chain
import os

# Build "<prefix>/<name>" paths for every sub-folder of a root folder
def get_relative_folders_with_prefix(root_folder, prefix="../../"):
    """Return ``prefix`` joined with the name of each sub-folder of ``root_folder``.

    Only immediate children that are directories are kept; plain files are
    ignored. The result follows the order in which ``os.listdir`` yields
    the names.

    Args:
        root_folder: Folder whose immediate sub-folders are listed.
        prefix: Path placed in front of every sub-folder name.

    Returns:
        A list of ``os.path.join(prefix, name)`` strings.
    """
    return [
        os.path.join(prefix, entry_name)
        for entry_name in os.listdir(root_folder)
        if os.path.isdir(os.path.join(root_folder, entry_name))  # keep folders only
    ]

# Collect the class folders of every raw split (train, validation, test)
root_folder1 = "../../data/raw/train"
root_folder2 = "../../data/raw/validation"
root_folder3 = "../../data/raw/test"

# Each prefix is the split root plus a trailing slash, so the returned
# paths point at the raw class folders themselves.
prefix1, prefix2, prefix3 = (
    f"{split_root}/" for split_root in (root_folder1, root_folder2, root_folder3)
)

folders_with_prefix1 = get_relative_folders_with_prefix(root_folder1, prefix1)
folders_with_prefix2 = get_relative_folders_with_prefix(root_folder2, prefix2)
folders_with_prefix3 = get_relative_folders_with_prefix(root_folder3, prefix3)

# One flat list: train folders first, then validation, then test
folders_with_prefix = folders_with_prefix1 + folders_with_prefix2 + folders_with_prefix3

# Print the result
print(folders_with_prefix)


['../../data/raw/train/apple', '../../data/raw/train/cabbage', '../../data/raw/train/carrot', '../../data/raw/train/cucumber', '../../data/raw/train/eggplant', '../../data/raw/train/pear', '../../data/raw/validation/apple', '../../data/raw/validation/cabbage', '../../data/raw/validation/carrot', '../../data/raw/validation/cucumber', '../../data/raw/validation/eggplant', '../../data/raw/validation/pear', '../../data/raw/test/apple', '../../data/raw/test/cabbage', '../../data/raw/test/carrot', '../../data/raw/test/cucumber', '../../data/raw/test/eggplant', '../../data/raw/test/pear']


In [15]:
# Report how many entries each raw class folder contains
folder_paths = folders_with_prefix

for folder_path in folder_paths:
    file_count = count_files_in_folder(folder_path)
    summary_line = f"Number of files in {folder_path}: {file_count}"
    print(summary_line)

Number of files in ../../data/raw/train/apple: 6498
Number of files in ../../data/raw/train/cabbage: 176
Number of files in ../../data/raw/train/carrot: 552
Number of files in ../../data/raw/train/cucumber: 1264
Number of files in ../../data/raw/train/eggplant: 270
Number of files in ../../data/raw/train/pear: 964
Number of files in ../../data/raw/validation/apple: 1625
Number of files in ../../data/raw/validation/cabbage: 45
Number of files in ../../data/raw/validation/carrot: 138
Number of files in ../../data/raw/validation/cucumber: 316
Number of files in ../../data/raw/validation/eggplant: 68
Number of files in ../../data/raw/validation/pear: 242
Number of files in ../../data/raw/test/apple: 1624
Number of files in ../../data/raw/test/cabbage: 44
Number of files in ../../data/raw/test/carrot: 138
Number of files in ../../data/raw/test/cucumber: 316
Number of files in ../../data/raw/test/eggplant: 67
Number of files in ../../data/raw/test/pear: 241


### Check image size

In [16]:
from PIL import Image
import os

# Report the largest and smallest image (by pixel area) of every raw class folder
folder_paths = folders_with_prefix

for folder_path in folder_paths:
    # Extremes are tracked by pixel area; 'name' stays None until at least
    # one image has been read successfully.
    largest_image = {'name': None, 'size': (0, 0)}
    smallest_image = {'name': None, 'size': (float('inf'), float('inf'))}

    # Read all files in the folder
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        try:
            with Image.open(file_path) as img:
                width, height = img.size  # Get image dimensions
        except Exception as e:
            # Best effort: report the unreadable entry and keep scanning
            print(f"Error processing file {file_name}: {e}")
            continue

        area = width * height

        # Strict comparisons keep the first image seen when areas tie
        if area > largest_image['size'][0] * largest_image['size'][1]:
            largest_image = {'name': file_name, 'size': (width, height)}
        if area < smallest_image['size'][0] * smallest_image['size'][1]:
            smallest_image = {'name': file_name, 'size': (width, height)}

    # Output the results
    print(f'Folder: {folder_path}')
    if largest_image['name'] is None:
        # Empty folder or nothing readable: avoid printing the
        # "None with size infxinf" placeholder values.
        print("No readable images found.")
    else:
        print(f"Largest Image: {largest_image['name']} with size {largest_image['size'][0]}x{largest_image['size'][1]} pixels")
        print(f"Smallest Image: {smallest_image['name']} with size {smallest_image['size'][0]}x{smallest_image['size'][1]} pixels")
    print()

Folder: ../../data/raw/train/apple
Largest Image: r0_238_7.jpg with size 889x902 pixels
Smallest Image: 10KRA2V6IR4W.jpg with size 256x256 pixels

Folder: ../../data/raw/train/cabbage
Largest Image: r0_78.jpg with size 785x766 pixels
Smallest Image: 1I6VLWT838MY.jpg with size 256x256 pixels

Folder: ../../data/raw/train/carrot
Largest Image: r0_30.jpg with size 217x649 pixels
Smallest Image: 008A1K43RS91.jpg with size 128x128 pixels

Folder: ../../data/raw/train/cucumber
Largest Image: r0_109_1.jpg with size 373x1008 pixels
Smallest Image: 01G2WAX8DVHE.jpg with size 256x256 pixels

Folder: ../../data/raw/train/eggplant
Largest Image: r0_124.jpg with size 203x692 pixels
Smallest Image: r0_44.jpg with size 100x631 pixels

Folder: ../../data/raw/train/pear
Largest Image: r0_73_1.jpg with size 624x746 pixels
Smallest Image: 089AWLOGING7.jpg with size 256x256 pixels

Folder: ../../data/raw/validation/apple
Largest Image: r0_218_7.jpg with size 889x904 pixels
Smallest Image: 10XOKPXTSY0P.jpg

## Data augmentation

### Resize image

In [17]:
from itertools import chain
import os

# Function to list sub-folders of a root folder, each re-rooted under a prefix
# (re-declares the helper of the same name from an earlier cell)
def get_relative_folders_with_prefix(root_folder, prefix="../../"):
    """List the sub-folders of ``root_folder`` as paths under ``prefix``.

    Args:
        root_folder: Folder whose immediate children are inspected.
        prefix: Path joined in front of each sub-folder name.

    Returns:
        A list with one ``os.path.join(prefix, name)`` entry per immediate
        sub-directory, in the order ``os.listdir`` reports the names.
    """
    prefixed_paths = []
    for child_name in os.listdir(root_folder):
        if not os.path.isdir(os.path.join(root_folder, child_name)):
            continue  # skip plain files
        prefixed_paths.append(os.path.join(prefix, child_name))
    return prefixed_paths

# Build the processed/resize target folder for every raw class folder
rooter = "../../data"
root_folder1, root_folder2, root_folder3 = "train", "validation", "test"

# Target prefixes mirror the raw split layout under processed/resize
prefix1 = f"{rooter}/processed/resize/{root_folder1}/"
prefix2 = f"{rooter}/processed/resize/{root_folder2}/"
prefix3 = f"{rooter}/processed/resize/{root_folder3}/"

# Class names are read from the raw split folders and re-rooted under the prefixes
folders_with_prefix1 = get_relative_folders_with_prefix(f"{rooter}/raw/{root_folder1}/", prefix1)
folders_with_prefix2 = get_relative_folders_with_prefix(f"{rooter}/raw/{root_folder2}/", prefix2)
folders_with_prefix3 = get_relative_folders_with_prefix(f"{rooter}/raw/{root_folder3}/", prefix3)

# Same split order as the raw folder list: train, validation, test
folders_with_prefix_aug_re = [
    *folders_with_prefix1,
    *folders_with_prefix2,
    *folders_with_prefix3,
]

# Print the result
print(folders_with_prefix_aug_re)


['../../data/processed/resize/train/apple', '../../data/processed/resize/train/cabbage', '../../data/processed/resize/train/carrot', '../../data/processed/resize/train/cucumber', '../../data/processed/resize/train/eggplant', '../../data/processed/resize/train/pear', '../../data/processed/resize/validation/apple', '../../data/processed/resize/validation/cabbage', '../../data/processed/resize/validation/carrot', '../../data/processed/resize/validation/cucumber', '../../data/processed/resize/validation/eggplant', '../../data/processed/resize/validation/pear', '../../data/processed/resize/test/apple', '../../data/processed/resize/test/cabbage', '../../data/processed/resize/test/carrot', '../../data/processed/resize/test/cucumber', '../../data/processed/resize/test/eggplant', '../../data/processed/resize/test/pear']


In [18]:
from PIL import Image
import os

# Resize every raw image to a fixed size and save it under the same file
# name in the matching processed/resize class folder.
source_folders = folders_with_prefix
target_folders = folders_with_prefix_aug_re

# Folders are paired by position, so the two lists must line up one-to-one;
# a mismatch would write images into the wrong class folder.
if len(source_folders) != len(target_folders):
    raise ValueError(
        f"Expected one target folder per source folder, got "
        f"{len(source_folders)} source(s) and {len(target_folders)} target(s)."
    )

# Pillow's resize() takes (width, height). The value is square, so the order
# does not change the result here, but the label must be right for later edits.
target_size = (150, 150)  # (Width, Height)

for source_folder, target_folder in zip(source_folders, target_folders):
    # exist_ok avoids the check-then-create gap of os.path.exists + makedirs
    os.makedirs(target_folder, exist_ok=True)

    failed_count = 0  # files in this folder that could not be resized/saved

    for file_name in os.listdir(source_folder):
        source_path = os.path.join(source_folder, file_name)
        target_path = os.path.join(target_folder, file_name)

        try:
            with Image.open(source_path) as img:
                # Stretch/Shrink image (aspect ratio is not preserved)
                resized_img = img.resize(target_size, Image.Resampling.LANCZOS)
                resized_img.save(target_path)
                # print(f"Resized and saved: {file_name}")
        except Exception as e:
            # Best effort: report the file and continue with the rest
            failed_count += 1
            print(f"Error processing file {file_name}: {e}")

    # Only claim full success when every file was actually written
    if failed_count:
        print(f"Finished folder '{target_folder}' with {failed_count} file(s) that could not be resized.")
    else:
        print(f"All images are resized and saved in the folder '{target_folder}'.")


All images are resized and saved in the folder '../../data/processed/resize/train/apple'.
All images are resized and saved in the folder '../../data/processed/resize/train/cabbage'.
All images are resized and saved in the folder '../../data/processed/resize/train/carrot'.
All images are resized and saved in the folder '../../data/processed/resize/train/cucumber'.
All images are resized and saved in the folder '../../data/processed/resize/train/eggplant'.
All images are resized and saved in the folder '../../data/processed/resize/train/pear'.
All images are resized and saved in the folder '../../data/processed/resize/validation/apple'.
All images are resized and saved in the folder '../../data/processed/resize/validation/cabbage'.
All images are resized and saved in the folder '../../data/processed/resize/validation/carrot'.
All images are resized and saved in the folder '../../data/processed/resize/validation/cucumber'.
All images are resized and saved in the folder '../../data/process

### Adjust image

In [19]:
from itertools import chain
import os

# Function to map each sub-folder of a root folder to a path under a prefix
# (re-declares the helper of the same name from earlier cells)
def get_relative_folders_with_prefix(root_folder, prefix="../../"):
    """Return one ``prefix``-based path per immediate sub-folder of ``root_folder``.

    Args:
        root_folder: Folder whose immediate children are examined.
        prefix: Leading path joined with each sub-folder name.

    Returns:
        A list of ``os.path.join(prefix, name)`` strings, ordered as
        ``os.listdir`` returns the names; non-directory entries are left out.
    """
    subfolder_names = [
        name
        for name in os.listdir(root_folder)
        if os.path.isdir(os.path.join(root_folder, name))
    ]
    return [os.path.join(prefix, name) for name in subfolder_names]

# Build the processed/adjusting target folder for every raw class folder
rooter = "../../data"
root_folder1, root_folder2, root_folder3 = "train", "validation", "test"

# Target prefixes mirror the raw split layout under processed/adjusting
prefix1 = f"{rooter}/processed/adjusting/{root_folder1}/"
prefix2 = f"{rooter}/processed/adjusting/{root_folder2}/"
prefix3 = f"{rooter}/processed/adjusting/{root_folder3}/"

# Class names are read from the raw split folders and re-rooted under the prefixes
folders_with_prefix1 = get_relative_folders_with_prefix(f"{rooter}/raw/{root_folder1}/", prefix1)
folders_with_prefix2 = get_relative_folders_with_prefix(f"{rooter}/raw/{root_folder2}/", prefix2)
folders_with_prefix3 = get_relative_folders_with_prefix(f"{rooter}/raw/{root_folder3}/", prefix3)

# Same split order as the raw folder list: train, validation, test
folders_with_prefix_aug_adj = [
    *folders_with_prefix1,
    *folders_with_prefix2,
    *folders_with_prefix3,
]

# Print the result
print(folders_with_prefix_aug_adj)


['../../data/processed/adjusting/train/apple', '../../data/processed/adjusting/train/cabbage', '../../data/processed/adjusting/train/carrot', '../../data/processed/adjusting/train/cucumber', '../../data/processed/adjusting/train/eggplant', '../../data/processed/adjusting/train/pear', '../../data/processed/adjusting/validation/apple', '../../data/processed/adjusting/validation/cabbage', '../../data/processed/adjusting/validation/carrot', '../../data/processed/adjusting/validation/cucumber', '../../data/processed/adjusting/validation/eggplant', '../../data/processed/adjusting/validation/pear', '../../data/processed/adjusting/test/apple', '../../data/processed/adjusting/test/cabbage', '../../data/processed/adjusting/test/carrot', '../../data/processed/adjusting/test/cucumber', '../../data/processed/adjusting/test/eggplant', '../../data/processed/adjusting/test/pear']


In [20]:
import os
import cv2
import albumentations as A

# Paths
source_folders = folders_with_prefix
output_folders = folders_with_prefix_aug_adj

# Folders are paired by position, so the two lists must line up one-to-one;
# a mismatch would write images into the wrong class folder.
if len(source_folders) != len(output_folders):
    raise ValueError(
        f"Expected one output folder per source folder, got "
        f"{len(source_folders)} source(s) and {len(output_folders)} output(s)."
    )

# NOTE(review): source_folders also holds the validation and test class
# folders, so those splits are augmented too — confirm that is intended.
# NOTE(review): no random seed is set in this cell, so the augmented files
# presumably differ between runs — confirm whether reproducibility matters.


# Define complex augmentations with resizing to original dimensions
# (defined once, instead of being re-created on every folder iteration)
def get_augmentation_pipeline(image_width, image_height):
    """Build the augmentation pipeline for an image of the given size.

    Args:
        image_width: Width passed to the final ``A.Resize`` step.
        image_height: Height passed to the final ``A.Resize`` step.

    Returns:
        An ``A.Compose`` object; calling it with ``image=...`` returns a
        dict whose ``"image"`` entry is the augmented image.
    """
    return A.Compose([
        # A.RandomCrop(width=image_width // 2, height=image_height // 2, p=0.5),    # Random crop, half size
        A.HorizontalFlip(p=0.5),                                                  # Flip horizontally
        A.VerticalFlip(p=0.2),                                                    # Flip vertically
        A.RandomBrightnessContrast(p=0.5),                                        # Adjust brightness/contrast
        A.GaussianBlur(blur_limit=(3, 7), p=0.3),                                 # Apply Gaussian blur
        A.Rotate(limit=45, p=0.5),                                                # Random rotation
        A.RGBShift(r_shift_limit=20, g_shift_limit=20, b_shift_limit=20, p=0.3),  # Shift RGB channels
        A.CLAHE(clip_limit=2, p=0.3),                                             # Adaptive histogram equalization
        A.MultiplicativeNoise(multiplier=(0.9, 1.1), p=0.3),                      # Add noise
        A.Resize(height=image_height, width=image_width, p=1.0)                   # Resize back to original size
    ])


for source_folder, output_folder in zip(source_folders, output_folders):
    # Create output folder if it doesn't exist
    os.makedirs(output_folder, exist_ok=True)

    failed_count = 0  # files in this folder that raised or could not be written

    # Process and augment each image
    for file_name in os.listdir(source_folder):
        source_path = os.path.join(source_folder, file_name)
        base_name = os.path.splitext(file_name)[0]  # shared stem of all outputs

        try:
            # Read the image
            image = cv2.imread(source_path)
            if image is None:
                print(f"Skipping non-image file: {file_name}")
                continue

            # Get original dimensions
            image_height, image_width = image.shape[:2]

            # Save original image; imwrite reports failure through its return
            # value rather than an exception, so check it explicitly.
            original_output_path = os.path.join(output_folder, f"{base_name}_original.jpg")
            if not cv2.imwrite(original_output_path, image):
                raise OSError(f"could not write {original_output_path}")

            # Define the augmentation pipeline with resizing to original dimensions
            augmentation_pipeline = get_augmentation_pipeline(image_width, image_height)

            # Generate augmented images
            for i in range(5):  # Number of augmentations per image
                augmented = augmentation_pipeline(image=image)
                augmented_image = augmented["image"]

                # Save augmented image
                augmented_output_path = os.path.join(output_folder, f"{base_name}_aug_{i}.jpg")
                if not cv2.imwrite(augmented_output_path, augmented_image):
                    raise OSError(f"could not write {augmented_output_path}")
                # print(f"Saved augmented image: {augmented_output_path}")

        except Exception as e:
            # Best effort: report the file and continue with the rest
            failed_count += 1
            print(f"Error processing file {file_name}: {e}")

    # Only claim full success when no file failed
    if failed_count:
        print(f"Finished folder '{output_folder}' with {failed_count} file(s) that could not be processed.")
    else:
        print(f"All images (original + augmentations) are saved in '{output_folder}'.")


All images (original + augmentations) are saved in '../../data/processed/adjusting/train/apple'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/train/cabbage'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/train/carrot'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/train/cucumber'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/train/eggplant'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/train/pear'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/validation/apple'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/validation/cabbage'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/validation/carrot'.
All images (original + augmentations) are saved in '../../data/processed/adjusting/validation/cucumbe

### Recheck image size

In [22]:
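# NOTE: this cell is disabled. The folder paths listed below (augmentation/.../cats, dogs)
# do not match the processed/resize and processed/adjusting class folders written above;
# update them before re-enabling this check.
# from PIL import Image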
# import os

# # Define the folder path
# folder_paths = [
#     "../../data/processed/augmentation/train/cats",
#     "../../data/processed/augmentation/train/dogs",
#     "../../data/processed/augmentation/test/cats",
#     "../../data/processed/augmentation/test/dogs",
# ]

# for folder_path in folder_paths:
#     # Initialize variables to track the largest and smallest images
#     largest_image = {'name': None, 'size': (0, 0)}
#     smallest_image = {'name': None, 'size': (float('inf'), float('inf'))}

#     # Read all files in the folder
#     file_list = os.listdir(folder_path)
#     for file_name in file_list:
#         file_path = os.path.join(folder_path, file_name)
#         try:
#             with Image.open(file_path) as img:
#                 width, height = img.size  # Get image dimensions
                
#                 # Check for the largest image
#                 if width * height > largest_image['size'][0] * largest_image['size'][1]:
#                     largest_image = {'name': file_name, 'size': (width, height)}
                
#                 # Check for the smallest image
#                 if width * height < smallest_image['size'][0] * smallest_image['size'][1]:
#                     smallest_image = {'name': file_name, 'size': (width, height)}
#         except Exception as e:
#             print(f"Error processing file {file_name}: {e}")

#     # Output the results
#     print(f'Folder: {folder_path}')
#     print(f"Largest Image: {largest_image['name']} with size {largest_image['size'][0]}x{largest_image['size'][1]} pixels")
#     print(f"Smallest Image: {smallest_image['name']} with size {smallest_image['size'][0]}x{smallest_image['size'][1]} pixels")
#     print()