In [1]:
# !ls dataset/val/0 | xargs -I {} mv dataset/val/0/{} dataset/train/0/
# !ls dataset/val/1 | xargs -I {} mv dataset/val/1/{} dataset/train/1/

In [11]:
# Flatten the labelled test folders: every entry listed in dataset/test/0 and
# dataset/test/1 is copied directly into dataset/test.
# The two commented-out lines are the equivalent copy of the train class
# folders into dataset/all (currently disabled).
# !ls dataset/train/0 | xargs -I {} cp dataset/train/0/{} dataset/all
# !ls dataset/train/1 | xargs -I {} cp dataset/train/1/{} dataset/all
!ls dataset/test/0 | xargs -I {} cp dataset/test/0/{} dataset/test
!ls dataset/test/1 | xargs -I {} cp dataset/test/1/{} dataset/test

In [12]:
# rsync the files named by `ls dataset/test/<class>` from each class folder into
# dataset/test; --checksum makes rsync decide what to transfer by file content
# instead of size/modification time.
# NOTE(review): `<(...)` is process substitution (bash/zsh syntax) — confirm the
# shell used by `!` supports it.
# !rsync --checksum --files-from=<(ls augmented/train/0) augmented/train/0/ final_dataset/train/0/
!rsync --checksum --files-from=<(ls dataset/test/0) dataset/test/0 dataset/test/
!rsync --checksum --files-from=<(ls dataset/test/1) dataset/test/1 dataset/test/

# Preprocessing Functions

In [2]:
import cv2
from torchvision.transforms import v2
from PIL import Image
import os
import numpy as np

def walk_through_dir(dir_path):
    """Print a one-line summary for every directory found under ``dir_path``.

    Each line reports how many sub-directories and how many files the
    directory contains. Every file is counted as an "image", whatever its type.
    """
    for current_dir, subdirs, files in os.walk(dir_path):
        dir_count, file_count = len(subdirs), len(files)
        print(f"There are {dir_count} directories and {file_count} images in '{current_dir}'.")

def crop_image_from_gray(img, tol=20):
    """Crop away rows and columns that contain no pixel brighter than ``tol``.

    Args:
        img: 2-D (single channel) or 3-D array. For 3-D input the brightness
            mask is computed with ``cv2.COLOR_BGR2GRAY`` and only the first
            three channels are kept in the result.
        tol: Brightness threshold; a row/column is kept when at least one of
            its pixels is strictly greater than this value.

    Returns:
        The cropped array. When no pixel exceeds ``tol`` the input is returned
        unchanged (for both 2-D and 3-D input) instead of an empty array.

    Raises:
        ValueError: If ``img`` is neither 2-D nor 3-D.
    """
    if img.ndim == 2:
        mask = img > tol
    elif img.ndim == 3:
        mask = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) > tol
    else:
        raise ValueError(f"Expected a 2-D or 3-D image, got {img.ndim} dimensions")

    # Nothing is brighter than the threshold: cropping would produce an empty
    # image, so hand back the input untouched.
    if not mask.any():
        return img

    # Build the row/column selector once and reuse it for every channel.
    region = np.ix_(mask.any(1), mask.any(0))
    if img.ndim == 2:
        return img[region]
    return np.stack([img[:, :, channel][region] for channel in range(3)], axis=-1)


def circle_crop_v2(img):
    """Keep only the centred disc of an image, cropping dark borders around it.

    Steps: crop the dark border, stretch the result to a square whose side is
    the longer edge, zero out everything outside the centred circle, then crop
    the dark border again. Expects a 3-D array (the shape is unpacked into
    three values).
    """
    trimmed = crop_image_from_gray(img)

    # Stretch to a square; the aspect ratio is not preserved.
    rows, cols, _ = trimmed.shape
    side = np.max((rows, cols))
    square = cv2.resize(trimmed, (side, side))

    rows, cols, _ = square.shape
    center_x = int(cols / 2)
    center_y = int(rows / 2)
    radius = np.amin((center_x, center_y))

    # Filled circle (thickness=-1) drawn with value 1 acts as the keep-mask.
    disc_mask = np.zeros((rows, cols), np.uint8)
    cv2.circle(disc_mask, (center_x, center_y), int(radius), 1, thickness=-1)
    masked = cv2.bitwise_and(square, square, mask=disc_mask)

    return crop_image_from_gray(masked)


def load_ben_color(img, IMG_SIZE=512, sigmaX=7):
    """Colour-convert, circle-crop, resize and contrast-boost an image.

    Args:
        img: Image array; its channels are swapped with ``cv2.COLOR_BGR2RGB``,
            so BGR input yields RGB output.
        IMG_SIZE: Side length of the square output.
        sigmaX: Standard deviation of the Gaussian blur that is subtracted.

    Returns:
        ``IMG_SIZE`` x ``IMG_SIZE`` image computed as
        ``4 * resized - 4 * blurred + 128``.
    """
    rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
    cropped = circle_crop_v2(rgb)
    resized = cv2.resize(cropped, (IMG_SIZE, IMG_SIZE))
    # Kernel size (0, 0) lets OpenCV derive it from sigmaX.
    blurred = cv2.GaussianBlur(resized, (0, 0), sigmaX)
    return cv2.addWeighted(resized, 4, blurred, -4, 128)


# CLAHE contrast enhancement applied to the lightness channel only
def apply_clahe(img):
    """Return ``img`` with CLAHE applied to its L channel in LAB space.

    The image is converted RGB -> LAB, the L channel is equalised with
    ``clipLimit=3.0`` on an 8x8 tile grid, and the result is converted back
    LAB -> RGB. The A and B channels are passed through unchanged.
    """
    lightness, chan_a, chan_b = cv2.split(cv2.cvtColor(img, cv2.COLOR_RGB2LAB))
    equalizer = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
    enhanced_lab = cv2.merge((equalizer.apply(lightness), chan_a, chan_b))
    return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2RGB)

# Preprocess one image, apply a random augmentation and save the result
def _build_augmentation(level, test):
    """Return the random augmentation pipeline for one image."""
    steps = [
        v2.RandomHorizontalFlip(),
        v2.RandomVerticalFlip(),
        v2.RandomRotation(90),
    ]
    # Only training images of a non-zero level additionally get a perspective warp.
    if not test and level != 0:
        steps.append(v2.RandomPerspective())
    return v2.Compose(steps)


def augment_and_save(input_path, output_path, level, replicate, test=False):
    """Preprocess ``input_path``, augment it randomly and save it under ``output_path``.

    The result is written to ``<output_path>/<level>/<replicate>_<basename>``.
    If that file already exists the call prints a message and does nothing.

    Args:
        input_path: Path of the source image.
        output_path: Root folder; a sub-folder named after ``level`` is created.
        level: Class label; selects the sub-folder and, for training images,
            whether ``RandomPerspective`` is part of the augmentation.
        replicate: Prefix that distinguishes several augmented copies of the
            same source image.
        test: When true, the flip/rotation-only pipeline is always used.

    Note:
        Images written by earlier versions of this function had their red and
        blue channels swapped. Because existing files are skipped, clear the
        output folder before re-running to avoid mixing both variants.
    """
    level_folder = os.path.join(output_path, f"{level}")
    os.makedirs(level_folder, exist_ok=True)

    augmented_image_filename = f"{replicate}_{os.path.basename(input_path)}"
    augmented_image_path = os.path.join(level_folder, augmented_image_filename)

    # Skip work that has already been done so an interrupted run can resume.
    if os.path.exists(augmented_image_path):
        print(f"Skipping existing file: {augmented_image_path}")
        return

    # Force a 3-channel RGB array (handles grayscale / RGBA / palette files) and
    # release the file handle as soon as the pixels are in memory.
    with Image.open(input_path) as image:
        rgb = np.array(image.convert("RGB"))

    # load_ben_color starts with a BGR -> RGB conversion, i.e. it expects
    # OpenCV's channel order. PIL delivers RGB, so convert first; otherwise the
    # red and blue channels are swapped for CLAHE and in the saved file.
    bgr = cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)
    processed = apply_clahe(load_ben_color(bgr))

    transform = _build_augmentation(level, test)
    augmented_image = transform(Image.fromarray(processed))
    augmented_image.save(augmented_image_path)

# Preprocessing Training Images

In [9]:
import os
import pandas as pd

# Load the training labels
csv_path = 'dataset/train.csv'
df = pd.read_csv(csv_path)

# Source images and destination root for the augmented copies
image_folder_path = 'dataset/train'
output_folder_path = 'augmented2/train'
os.makedirs(output_folder_path, exist_ok=True)

# Replicate ids generated for each level (presumably chosen to balance the
# classes — confirm). Levels not listed here produce no output.
replicates_by_level = {
    0: (1,),
    1: range(5),
    2: range(2),
    3: range(13),
    4: range(16),
}

print("Processing images...")

# Walk the label table and write the augmented copies of every image
n = 0
for index, row in df.iterrows():
    image_name = row['image']
    level = row['level']
    image_path = os.path.join(image_folder_path, f"{image_name}.jpeg")  # Adjust the extension based on your image format

    for i in replicates_by_level.get(level, ()):
        augment_and_save(image_path, output_folder_path, level, replicate=i)

    n += 1
    if n % 500 == 0:
        print(f"Processed {n} images.")
print("Processing complete!")


Processing images...
Skipping existing file: augmented2/train/0/1_10_left.jpeg
Skipping existing file: augmented2/train/0/1_10_right.jpeg
Skipping existing file: augmented2/train/0/1_13_left.jpeg
Skipping existing file: augmented2/train/0/1_13_right.jpeg
Skipping existing file: augmented2/train/1/0_15_left.jpeg
Skipping existing file: augmented2/train/1/1_15_left.jpeg
Skipping existing file: augmented2/train/1/2_15_left.jpeg
Skipping existing file: augmented2/train/1/3_15_left.jpeg
Skipping existing file: augmented2/train/1/4_15_left.jpeg
Skipping existing file: augmented2/train/2/0_15_right.jpeg
Skipping existing file: augmented2/train/2/1_15_right.jpeg
Skipping existing file: augmented2/train/4/0_16_left.jpeg
Skipping existing file: augmented2/train/4/1_16_left.jpeg
Skipping existing file: augmented2/train/4/2_16_left.jpeg
Skipping existing file: augmented2/train/4/3_16_left.jpeg
Skipping existing file: augmented2/train/4/4_16_left.jpeg
Skipping existing file: augmented2/train/4/5_16

KeyboardInterrupt: 

# Preprocessing Test Images

In [3]:
import os
import pandas as pd

# Load the test labels
csv_path = 'dataset/test.csv'
df = pd.read_csv(csv_path)

# Source images and destination root for the augmented copies
image_folder_path = 'dataset/test'
output_folder_path = 'augmented2/test'
os.makedirs(output_folder_path, exist_ok=True)

# Number of augmented copies written per diagnosis level. The trailing numbers
# are the alternative values noted in the original cell.
copies_by_level = {
    0: 1,
    1: 2,  # 5
    2: 1,  # 2
    3: 3,  # 9
    4: 2,  # 6
}

print("Start Preprocessing...")

# Walk the label table and write the augmented copies of every image
n = 0
for index, row in df.iterrows():
    image_name = row['id_code']
    level = row['diagnosis']
    image_path = os.path.join(image_folder_path, f"{image_name}.png")  # Adjust the extension based on your image format

    for i in range(copies_by_level.get(level, 0)):
        augment_and_save(image_path, output_folder_path, level, replicate=i, test=True)

    n += 1
    if n % 200 == 0:
        print(f"Processed {n} images.")
print("Processing complete!")

Start Preprocessing...
Processed 200 images.
Processed 400 images.
Processed 600 images.
Processed 800 images.
Processed 1000 images.
Processed 1200 images.
Processed 1400 images.
Processed 1600 images.
Processed 1800 images.
Processed 2000 images.
Processed 2200 images.
Processed 2400 images.
Processed 2600 images.
Processed 2800 images.
Processed 3000 images.
Processed 3200 images.
Processed 3400 images.
Processed 3600 images.
Processing complete!


In [6]:
# Report directory/file counts for the raw, augmented and final dataset trees
for tree_root in ("dataset", "augmented2", "final_dataset"):
    walk_through_dir(tree_root)

There are 2 directories and 2 images in 'dataset'.
There are 0 directories and 3662 images in 'dataset/test'.
There are 0 directories and 35126 images in 'dataset/train'.
There are 2 directories and 0 images in 'augmented2'.
There are 5 directories and 0 images in 'augmented2/test'.
There are 0 directories and 590 images in 'augmented2/test/4'.
There are 0 directories and 1805 images in 'augmented2/test/0'.
There are 0 directories and 999 images in 'augmented2/test/2'.
There are 0 directories and 740 images in 'augmented2/test/1'.
There are 0 directories and 579 images in 'augmented2/test/3'.
There are 5 directories and 0 images in 'augmented2/train'.
There are 0 directories and 7280 images in 'augmented2/train/4'.
There are 0 directories and 17304 images in 'augmented2/train/0'.
There are 0 directories and 7228 images in 'augmented2/train/2'.
There are 0 directories and 8260 images in 'augmented2/train/1'.
There are 0 directories and 7566 images in 'augmented2/train/3'.
There are 2 di

In [None]:
# !rsync --checksum --files-from=<(ls augmented/train/0) augmented/train/0/ final_dataset/train/0/
# !rsync --checksum --files-from=<(ls augmented/train/1 | shuf -n 7000) augmented/train/1/ final_dataset/train/1/
# !rsync --checksum --files-from=<(ls augmented/train/2 | shuf -n 7000) augmented/train/2/ final_dataset/train/1/
# !rsync --checksum --files-from=<(ls augmented/train/3 | shuf -n 7000) augmented/train/3/ final_dataset/train/1/
# !rsync --checksum --files-from=<(ls augmented/train/4 | shuf -n 7000) augmented/train/4/ final_dataset/train/1/

In [10]:
# Create the final_dataset/test class folders. `-p` creates missing parent
# folders and does not fail when a folder already exists, so the cell can be
# re-run and works on a fresh checkout.
# !mkdir final_dataset
# !mkdir final_dataset/train
# !mkdir final_dataset/train/0
# !mkdir final_dataset/train/1
# !mkdir final_dataset/train/2
!mkdir -p final_dataset/test
!mkdir -p final_dataset/test/0
!mkdir -p final_dataset/test/1
# !mkdir final_dataset/test/2

In [9]:
# Assemble the two-class training set: level 0 goes to final_dataset/train/0,
# levels 1-4 are all merged into final_dataset/train/1.
# --checksum makes rsync decide what to transfer by file content.
# NOTE(review): the preprocessing cells write to augmented2/, while this cell
# reads from augmented/ — confirm which folder is intended.
!rsync --checksum --files-from=<(ls augmented/train/0) augmented/train/0/ final_dataset/train/0/
!rsync --checksum --files-from=<(ls augmented/train/1) augmented/train/1/ final_dataset/train/1/
!rsync --checksum --files-from=<(ls augmented/train/2) augmented/train/2/ final_dataset/train/1/
!rsync --checksum --files-from=<(ls augmented/train/3) augmented/train/3/ final_dataset/train/1/
!rsync --checksum --files-from=<(ls augmented/train/4) augmented/train/4/ final_dataset/train/1/

In [11]:
# Assemble the two-class test set: level 0 goes to final_dataset/test/0,
# levels 1-4 are all merged into final_dataset/test/1.
# --checksum makes rsync decide what to transfer by file content.
# NOTE(review): the preprocessing cells write to augmented2/, while this cell
# reads from augmented/ — confirm which folder is intended.
!rsync --checksum --files-from=<(ls augmented/test/0) augmented/test/0/ final_dataset/test/0/
!rsync --checksum --files-from=<(ls augmented/test/1) augmented/test/1/ final_dataset/test/1/
!rsync --checksum --files-from=<(ls augmented/test/2) augmented/test/2/ final_dataset/test/1/
!rsync --checksum --files-from=<(ls augmented/test/3) augmented/test/3/ final_dataset/test/1/
!rsync --checksum --files-from=<(ls augmented/test/4) augmented/test/4/ final_dataset/test/1/

In [12]:
# Report directory/file counts for the augmented and final dataset trees
for tree_root in ("augmented", "final_dataset"):
    walk_through_dir(tree_root)

There are 2 directories and 0 images in 'augmented'.
There are 5 directories and 0 images in 'augmented/test'.
There are 0 directories and 295 images in 'augmented/test/4'.
There are 0 directories and 1805 images in 'augmented/test/0'.
There are 0 directories and 999 images in 'augmented/test/2'.
There are 0 directories and 370 images in 'augmented/test/1'.
There are 0 directories and 193 images in 'augmented/test/3'.
There are 5 directories and 0 images in 'augmented/train'.
There are 0 directories and 11328 images in 'augmented/train/4'.
There are 0 directories and 25810 images in 'augmented/train/0'.
There are 0 directories and 10584 images in 'augmented/train/2'.
There are 0 directories and 12215 images in 'augmented/train/1'.
There are 0 directories and 11349 images in 'augmented/train/3'.
There are 2 directories and 0 images in 'final_dataset'.
There are 2 directories and 0 images in 'final_dataset/test'.
There are 0 directories and 1805 images in 'final_dataset/test/0'.
There ar

import os
import torch
import torchvision
from torch.utils.data import DataLoader

def get_mean_std(dataloader):
    """Compute and print the per-channel mean and standard deviation of a dataset.

    Args:
        dataloader: Iterable yielding ``(images, labels)`` pairs where
            ``images`` is a 4-D tensor laid out as
            (batch, channels, height, width).

    Returns:
        ``(mean, std)``: two 1-D float64 tensors with one entry per channel.
        The standard deviation is the population value, sqrt(E[x^2] - E[x]^2).

    Raises:
        ValueError: If the dataloader yields no pixels.
    """
    channel_sum = None
    channel_sq_sum = None
    pixels_per_channel = 0

    for images, _ in dataloader:
        batch_size, _channels, height, width = images.size()

        # Accumulate in float64: float32 running sums over a whole image
        # dataset lose enough precision to distort the variance.
        pixels = images.to(torch.float64)
        batch_sum = pixels.sum(dim=(0, 2, 3))
        batch_sq_sum = (pixels ** 2).sum(dim=(0, 2, 3))

        if channel_sum is None:
            channel_sum, channel_sq_sum = batch_sum, batch_sq_sum
        else:
            channel_sum = channel_sum + batch_sum
            channel_sq_sum = channel_sq_sum + batch_sq_sum

        pixels_per_channel += batch_size * height * width

    if pixels_per_channel == 0:
        raise ValueError("Cannot compute statistics: the dataloader yielded no pixels")

    mean = channel_sum / pixels_per_channel
    # Rounding can push the variance marginally below zero, which would turn
    # the square root into NaN; clamp it.
    variance = (channel_sq_sum / pixels_per_channel - mean ** 2).clamp(min=0.0)
    std = torch.sqrt(variance)

    # Print the results
    if mean.numel() == 3:
        print(f"Mean - R: {mean[0].item()}, G: {mean[1].item()}, B: {mean[2].item()}")
        print(f"Std - R: {std[0].item()}, G: {std[1].item()}, B: {std[2].item()}")
    else:
        print(f"Mean - {mean.tolist()}")
        print(f"Std - {std.tolist()}")

    return mean, std


# os.cpu_count() returns None when the CPU count cannot be determined; fall
# back to 0 so DataLoader loads in the main process instead of failing.
NUM_WORKERS = os.cpu_count() or 0

# NOTE(review): absolute, machine-specific path — the rest of the notebook uses
# the relative "final_dataset" folder; confirm whether this can be made relative.
FINAL_DATASET_ROOT = "/home/farzamani/dl/final_dataset"

# ImageFolder derives the class labels from the sub-folder names; ToTensor
# converts each image to a float tensor.
train_data = torchvision.datasets.ImageFolder(root=os.path.join(FINAL_DATASET_ROOT, "train"), transform=torchvision.transforms.ToTensor())
test_data = torchvision.datasets.ImageFolder(root=os.path.join(FINAL_DATASET_ROOT, "test"), transform=torchvision.transforms.ToTensor())

train_dataloader = DataLoader(train_data, num_workers=NUM_WORKERS, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_data, num_workers=NUM_WORKERS, batch_size=32, shuffle=True)

# Print per-channel mean/std for each split
get_mean_std(train_dataloader)
get_mean_std(test_dataloader)