In [5]:
import os
import pandas as pd
import shutil
from sklearn.model_selection import train_test_split

# Path to the eyepacs_preprocess directory containing images
image_dir = 'eyepacs_preprocess'

# Path to the CSV file containing image labels
csv_file = 'labels.csv'

# Read the CSV file into a DataFrame
df = pd.read_csv(csv_file)

# Split the data into train and test sets (80:20)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['level'])

# Define the paths for the train and test directories
train_dir = 'train'
test_dir = 'test'

# Automatically create subfolders 'DR' and 'No_DR' within train and test directories
for directory in [train_dir, test_dir]:
    for class_name in ['DR', 'No_DR']:
        os.makedirs(os.path.join(directory, class_name), exist_ok=True)

# Move images to the appropriate subfolders based on their labels
for index, row in train_df.iterrows():
    image_name = row['image'] + '.jpeg'  # Append '.png' to the image name
    label = row['level']
    src_path = os.path.join(image_dir, image_name)
    dest_dir = os.path.join(train_dir, 'DR') if label != 0 else os.path.join(train_dir, 'No_DR')
    shutil.copy(src_path, os.path.join(dest_dir, image_name))

for index, row in test_df.iterrows():
    image_name = row['image'] + '.jpeg'  # Append '.png' to the image name
    label = row['level']
    src_path = os.path.join(image_dir, image_name)
    dest_dir = os.path.join(test_dir, 'DR') if label != 0 else os.path.join(test_dir, 'No_DR')
    shutil.copy(src_path, os.path.join(dest_dir, image_name))

print("Data split and subfolders created successfully.")


Data split and subfolders created successfully.


In [8]:
from PIL import Image
import os

# Path to the directory containing the images
image_dir = 'eyepacs_preprocess'

# Initialize variables to store dimensions of the first image
first_image_width = None
first_image_height = None

# Flag to indicate if all images have the same dimensions
all_images_have_same_dimensions = True

# Loop through all images in the directory
for filename in os.listdir(image_dir):
    if filename.endswith('.jpeg') or filename.endswith('.jpg'):
        # Open the image using Pillow
        image_path = os.path.join(image_dir, filename)
        image = Image.open(image_path)

        # Get the dimensions of the current image
        current_width, current_height = image.size

        # If this is the first image, store its dimensions
        if first_image_width is None and first_image_height is None:
            first_image_width = current_width
            first_image_height = current_height
        else:
            # Compare dimensions with the first image
            if current_width != first_image_width or current_height != first_image_height:
                all_images_have_same_dimensions = False
                break  # No need to check further

# Check the result
if all_images_have_same_dimensions:
    print("All images have the same dimensions.")
    print(f"Dimensions: {first_image_width} x {first_image_height}")
else:
    print("Images have different dimensions.")

Images have different dimensions.


In [9]:
from PIL import Image
import os

# Path to the directory containing the images
image_dir = 'eyepacs_preprocess'

# Create a directory to store resized images
output_dir = 'eyepacs_preprocess_resized'
os.makedirs(output_dir, exist_ok=True)

# Target dimensions
target_width = 224
target_height = 224

# Loop through all images in the directory
for filename in os.listdir(image_dir):
    if filename.endswith('.jpeg') or filename.endswith('.jpg'):
        # Open the image using Pillow
        image_path = os.path.join(image_dir, filename)
        image = Image.open(image_path)

        # Resize the image to the target dimensions
        resized_image = image.resize((target_width, target_height), Image.ANTIALIAS)

        # Save the resized image to the output directory
        output_path = os.path.join(output_dir, filename)
        resized_image.save(output_path)

print("All images have been resized to 224x224 pixels and saved to", output_dir)


  resized_image = image.resize((target_width, target_height), Image.ANTIALIAS)


All images have been resized to 224x224 pixels and saved to eyepacs_preprocess_resized


In [14]:
import os
import random
import shutil

# Paths to the original train and test directories
train_dir = 'train'
test_dir = 'test'

# Create new directories for the smaller balanced train and test sets
small_balanced_train_dir = 'small_balanced_train'
small_balanced_test_dir = 'small_balanced_test'

# Create the smaller balanced train and test directories if they don't exist
os.makedirs(small_balanced_train_dir, exist_ok=True)
os.makedirs(small_balanced_test_dir, exist_ok=True)

# Function to copy images from the source to destination
def copy_images(source_dir, destination_dir, num_images):
    image_files = os.listdir(source_dir)
    random.shuffle(image_files)  # Shuffle the images randomly
    
    for i, image in enumerate(image_files):
        if i >= num_images:
            break  # Stop copying once the desired number is reached
        
        src_path = os.path.join(source_dir, image)
        dest_path = os.path.join(destination_dir, image)
        
        # Ensure that the destination directory exists
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        
        shutil.copy(src_path, dest_path)

# Count the minimum number of images between 'DR' and 'No_DR' subfolders in the train dataset
num_images_in_DR_train = len(os.listdir(os.path.join(train_dir, 'DR')))
num_images_in_No_DR_train = len(os.listdir(os.path.join(train_dir, 'No_DR')))
min_images_train = min(num_images_in_DR_train, num_images_in_No_DR_train)

# Count the minimum number of images between 'DR' and 'No_DR' subfolders in the test dataset
num_images_in_DR_test = len(os.listdir(os.path.join(test_dir, 'DR')))
num_images_in_No_DR_test = len(os.listdir(os.path.join(test_dir, 'No_DR')))
min_images_test = min(num_images_in_DR_test, num_images_in_No_DR_test)

# Copy the same number of random images from the 'DR' and 'No_DR' subfolders in both train and test
copy_images(os.path.join(train_dir, 'DR'), os.path.join(small_balanced_train_dir, 'DR'), min_images_train)
copy_images(os.path.join(train_dir, 'No_DR'), os.path.join(small_balanced_train_dir, 'No_DR'), min_images_train)
copy_images(os.path.join(test_dir, 'DR'), os.path.join(small_balanced_test_dir, 'DR'), min_images_test)
copy_images(os.path.join(test_dir, 'No_DR'), os.path.join(small_balanced_test_dir, 'No_DR'), min_images_test)

print(f"{min_images_train} images from each class copied to smaller balanced train dataset.")
print(f"{min_images_test} images from each class copied to smaller balanced test dataset.")


7445 images from each class copied to smaller balanced train dataset.
1861 images from each class copied to smaller balanced test dataset.


In [15]:
import os
import random
import shutil

# Paths to the original train and test directories
train_dir = 'small_balanced_train'  # Updated to use the small balanced train dataset
valid_dir = 'valid'  # New directory for the validation dataset

# Create the validation directory if it doesn't exist
os.makedirs(valid_dir, exist_ok=True)

# Function to copy a specified percentage of random images from the source to destination
def copy_percentage_images(source_dir, destination_dir, percentage):
    image_files = os.listdir(source_dir)
    random.shuffle(image_files)  # Shuffle the images randomly
    num_images = int(len(image_files) * percentage)
    
    for i, image in enumerate(image_files):
        if i >= num_images:
            break  # Stop copying once the desired number is reached
        
        src_path = os.path.join(source_dir, image)
        dest_path = os.path.join(destination_dir, image)
        
        # Ensure that the destination directory exists
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        
        shutil.copy(src_path, dest_path)

# Set the desired percentage of training data to be used for validation
validation_percentage = 0.2  # Adjust this percentage as needed

# Copy the specified percentage of random images from both 'DR' and 'No_DR' subfolders in the training dataset
copy_percentage_images(os.path.join(train_dir, 'DR'), os.path.join(valid_dir, 'DR'), validation_percentage)
copy_percentage_images(os.path.join(train_dir, 'No_DR'), os.path.join(valid_dir, 'No_DR'), validation_percentage)

print(f"{validation_percentage * 100}% of training data copied to the validation dataset.")


20.0% of training data copied to the validation dataset.


In [16]:
import os
import random
import shutil

# Paths to the original train and validation directories
train_dir = 'small_balanced_train'  # Updated to use the small balanced train dataset
valid_dir = 'valid'  # New directory for the validation dataset

# Create the validation directory if it doesn't exist
os.makedirs(valid_dir, exist_ok=True)

# Function to move a specified percentage of random images from the source to destination
def move_percentage_images(source_dir, destination_dir, percentage):
    image_files = os.listdir(source_dir)
    random.shuffle(image_files)  # Shuffle the images randomly
    num_images = int(len(image_files) * percentage)
    
    for i, image in enumerate(image_files):
        if i >= num_images:
            break  # Stop moving once the desired number is reached
        
        src_path = os.path.join(source_dir, image)
        dest_path = os.path.join(destination_dir, image)
        
        # Ensure that the destination directory exists
        os.makedirs(os.path.dirname(dest_path), exist_ok=True)
        
        shutil.move(src_path, dest_path)

# Set the desired percentage of training data to be used for validation
validation_percentage = 0.2  # Adjust this percentage as needed

# Move the specified percentage of random images from both 'DR' and 'No_DR' subfolders in the training dataset
move_percentage_images(os.path.join(train_dir, 'DR'), os.path.join(valid_dir, 'DR'), validation_percentage)
move_percentage_images(os.path.join(train_dir, 'No_DR'), os.path.join(valid_dir, 'No_DR'), validation_percentage)

print(f"{validation_percentage * 100}% of training data moved to the validation dataset.")


20.0% of training data moved to the validation dataset.
