# Preprocessing the Diabetic Retinopathy Dataset

## Import Libraries

In [None]:
import cv2
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
from PIL import Image
from functools import reduce

## Import Function

Some of these functions are taken from this GitHub repository: https://github.com/qqwweee/keras-yolo3/blob/master/yolo3/utils.py

In [None]:
def compose(*funcs):
    """Chain callables left to right: ``compose(f, g)(x)`` evaluates ``g(f(x))``.

    The first callable receives every positional and keyword argument; each
    following callable receives the previous return value as its only
    argument. A single callable is returned unchanged.

    Raises:
        ValueError: if no callables are given.
    """
    if not funcs:
        raise ValueError('Composition of empty sequence not supported.')

    def _chain(inner, outer):
        # Bind the pair through parameters so the closure does not see later loop values
        return lambda *a, **kw: outer(inner(*a, **kw))

    pipeline = funcs[0]
    for step in funcs[1:]:
        pipeline = _chain(pipeline, step)
    return pipeline

def letterbox_image(image_pil, target_size_wh, padding_color):
    """
    Fit a PIL image onto a grayscale canvas of the requested size without distorting it.

    The image is scaled (bicubic) by the largest factor that keeps both sides
    inside target_size_wh, then pasted centred on an 'L'-mode canvas filled
    with padding_color (an integer, since the canvas is grayscale).
    An input with zero width or height yields the bare padding canvas.
    """
    src_w, src_h = image_pil.size
    w_target, h_target = target_size_wh

    # Degenerate input: there is nothing to resize, return only the padding
    if 0 in (src_w, src_h):
        return Image.new('L', target_size_wh, padding_color)

    # One factor for both axes keeps the aspect ratio
    factor = min(w_target / src_w, h_target / src_h)

    # Clamp to 1 px so a strong down-scale never produces an empty image
    fit_w = max(1, int(src_w * factor))
    fit_h = max(1, int(src_h * factor))
    fitted = image_pil.resize((fit_w, fit_h), Image.BICUBIC)

    canvas = Image.new('L', target_size_wh, padding_color)  # 'L' for grayscale
    offset = ((w_target - fit_w) // 2, (h_target - fit_h) // 2)
    canvas.paste(fitted, offset)
    return canvas

## Defining the Preprocessing Pipeline using Function Composition

In [None]:
def convert_to_grayscale(img_bgr):
    """Return a 2-D grayscale version of the given image array.

    A 2-D input is returned as is, an (H, W, 1) input is reshaped to (H, W),
    and a 3-channel input (treated as B, G, R order) is reduced with the
    weights 0.1140 / 0.5870 / 0.2989 and cast to uint8.
    """
    # Already single-plane: nothing to do
    if img_bgr.ndim == 2:
        return img_bgr

    # Single channel stored with a trailing axis: drop that axis
    if img_bgr.shape[2] == 1:
        return img_bgr.reshape(img_bgr.shape[0], img_bgr.shape[1])

    # Weighted sum of the colour planes; the cast discards the fractional part
    blue, green, red = cv2.split(img_bgr)
    luma = 0.2989 * red + 0.5870 * green + 0.1140 * blue
    return luma.astype(np.uint8)

def apply_clahe(img_gray):
    """Equalize a grayscale image with CLAHE (clip limit 2.0, 8x8 tile grid)."""
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    equalized = equalizer.apply(img_gray)
    return equalized

def apply_gaussian_blur(img_gray, kernel_size=(5,5)):
    """Smooth the image with a Gaussian kernel of the given size (default 5x5)."""
    # A sigma of 0 leaves the choice of standard deviation to OpenCV
    sigma_x = 0
    blurred = cv2.GaussianBlur(img_gray, kernel_size, sigma_x)
    return blurred

def apply_median_filter(img_gray, kernel_size=5):
    """Run OpenCV's median blur over the image with the given aperture size (default 5)."""
    filtered = cv2.medianBlur(img_gray, kernel_size)
    return filtered

def cv2_to_pil_grayscale(img_cv2):
    """Wrap a grayscale NumPy/OpenCV array in a PIL image of mode 'L'."""
    pil_image = Image.fromarray(img_cv2, mode='L')
    return pil_image

def pil_to_cv2_grayscale(img_pil):
    """Return the pixel data of a PIL grayscale image as a new NumPy array."""
    pixels = np.array(img_pil)
    return pixels

def segment_fundus_and_create_mask(image_cv2_gray, image_name_for_debug=""):
    """
    Locate the eye fundus in a grayscale image with blur, fixed threshold and morphology.

    Args:
        image_cv2_gray: grayscale image array to segment.
        image_name_for_debug: name used only in the warning message.

    Returns:
        tuple: (mask, bounding_box). mask has the shape and dtype of the input;
               the convex hull of the largest external contour is filled with
               255 and everything else is 0. bounding_box is the hull's
               (x, y, w, h), or None (with an all-zero mask) if no contour is found.
    """
    # Segmentation tuning knobs
    seg_blur_ksize = (15, 15)
    binarize_level = 30
    open_ksize = (15, 15)    # MORPH_OPEN kernel
    close_ksize = (35, 35)   # larger MORPH_CLOSE kernel, so neighbouring regions merge

    smoothed = cv2.GaussianBlur(image_cv2_gray, seg_blur_ksize, 0)

    # Fixed threshold; cv2.THRESH_BINARY + cv2.THRESH_OTSU (threshold argument 0)
    # is the alternative to try if this is not robust enough.
    binary = cv2.threshold(smoothed, binarize_level, 255, cv2.THRESH_BINARY)[1]

    # Clean up the binary image: opening first, then two rounds of closing
    open_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, open_ksize)
    close_kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, close_ksize)
    binary = cv2.morphologyEx(binary, cv2.MORPH_OPEN, open_kernel, iterations=1)
    binary = cv2.morphologyEx(binary, cv2.MORPH_CLOSE, close_kernel, iterations=2)

    found_contours, _ = cv2.findContours(binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    mask = np.zeros_like(image_cv2_gray)

    if not found_contours:
        print(f"Warning: No fundus contour found for {image_name_for_debug}. The resulting image might be black.")
        return mask, None

    # The largest contour is taken as the fundus.
    # NOTE: no minimum-area filter is applied; a check such as "contour area must be
    # at least 5% of the image area" could be added here to reject tiny blobs.
    fundus_contour = max(found_contours, key=cv2.contourArea)
    hull = cv2.convexHull(fundus_contour)
    cv2.drawContours(mask, [hull], -1, (255), thickness=cv2.FILLED)
    return mask, cv2.boundingRect(hull)  # (x, y, w, h)

In [None]:
# --- Global Parameters ---
# Size of the final letterboxed canvas; handed to letterbox_image as (width, height)
FINAL_IMAGE_SIZE = (512, 512)
FUNDUS_TARGET_SCALE_FACTOR = 0.9 # The fundus will occupy 90% of the final image's largest dimension
DEBUG_SAVE_INTERMEDIATE = False # Set to True to save debug images
# Folder that receives the intermediate images written by the processing cells in debug mode
DEBUG_OUTPUT_DIR = '/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/1_IDRiD_DEBUG/'
# Create the debug folder only when debug output is actually enabled
if DEBUG_SAVE_INTERMEDIATE:
    os.makedirs(DEBUG_OUTPUT_DIR, exist_ok=True)

# 1 IDRiD

## IDRiD Training Set

In [None]:
# --- File and folder paths ---
# Preprocess the IDRiD training images: every CSV row is read, converted to
# grayscale, enhanced, masked to the fundus, rescaled, letterboxed and saved as
# PNG inside a folder named after its class label.
import traceback  # full tracebacks for per-image failures (see the except branch)

csv_path = '/home/jupyter-sdm/GENITO/DATASETS/1_IDRiD/a. IDRiD_Disease Grading_Training Labels.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/1_IDRiD/train'
final_output_dir = '/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/1_IDRiD/'
file_extension = ".jpg"
column_class_name = 'Retinopathy grade'
colum_image_name = 'Image name'

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    exit()

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0      # missing/unreadable images, processing exceptions and failed writes
null_bbox_count = 0  # images saved as black because no usable fundus box was found

# Longest side, in pixels, that the cropped fundus is scaled to (identical for every image)
target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    image_name_no_ext = row[colum_image_name]
    image_name = image_name_no_ext + file_extension  # switched to .jpeg below if only that file exists
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name)  # Try .jpg first
    output_class_path = os.path.join(final_output_dir, image_class)

    # Handle cases where the image might have a .jpeg extension instead of .jpg
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg"  # Keep the name in sync with the file actually read
        else:
            print(f"Warning: {image_name_no_ext} (with .jpg/.jpeg extensions) not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread reports an unreadable file by returning None
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox is not None:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale so the longer bbox side becomes target_max_dim_px
                # (both sides are > 0 here, so the division is safe)
                current_max_dim_bbox = max(w_bbox, h_bbox)
                scale_ratio = target_max_dim_px / current_max_dim_bbox

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                # Fixed: this previously referenced the undefined name
                # `cropped_cropped_fundus_cv2`, so every valid image raised NameError.
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_name}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_name}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) onto a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image (always as PNG, whatever the source extension)
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        if not cv2.imwrite(output_path, final_img_to_save):
            # cv2.imwrite reports a failed write through its return value, not an exception
            print(f"Error writing {output_path}")
            error_count += 1

    except Exception as e:
        # One bad image must not abort the whole run: log it, count it, move on
        print(f"Error processing {image_name}: {e}")
        traceback.print_exc()  # Print the full traceback for easier debugging
        error_count += 1
        continue

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

## IDRiD Test Set

In [None]:
# --- File and folder paths ---
# Preprocess the IDRiD test images with the same pipeline as the other dataset
# cells: grayscale, enhancement, fundus masking, rescaling, letterboxing, PNG output
# into a folder named after the class label.
# NOTE(review): final_output_dir is the same folder the IDRiD training cell writes to,
# so train and test images end up merged per class - confirm this is intended.
import traceback  # full tracebacks for per-image failures (see the except branch)

csv_path = '/home/jupyter-sdm/GENITO/DATASETS/1_IDRiD/b. IDRiD_Disease Grading_Testing Labels.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/1_IDRiD/test'
final_output_dir = '/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/1_IDRiD/'

file_extension = ".jpg"
column_class_name = 'Retinopathy grade'
colum_image_name = 'Image name'

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    exit()

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0      # missing/unreadable images, processing exceptions and failed writes
null_bbox_count = 0  # images saved as black because no usable fundus box was found

# Longest side, in pixels, that the cropped fundus is scaled to (identical for every image)
target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    image_name_no_ext = row[colum_image_name]
    image_name = image_name_no_ext + file_extension  # switched to .jpeg below if only that file exists
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name)  # Try .jpg first
    output_class_path = os.path.join(final_output_dir, image_class)

    # Handle cases where the image might have a .jpeg extension instead of .jpg
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg"  # Keep the name in sync with the file actually read
        else:
            print(f"Warning: {image_name_no_ext} (with .jpg/.jpeg extensions) not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread reports an unreadable file by returning None
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox is not None:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale so the longer bbox side becomes target_max_dim_px
                # (both sides are > 0 here, so the division is safe)
                current_max_dim_bbox = max(w_bbox, h_bbox)
                scale_ratio = target_max_dim_px / current_max_dim_bbox

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_name}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_name}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) onto a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image (always as PNG, whatever the source extension)
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        if not cv2.imwrite(output_path, final_img_to_save):
            # cv2.imwrite reports a failed write through its return value, not an exception
            print(f"Error writing {output_path}")
            error_count += 1

    except Exception as e:
        # One bad image must not abort the whole run: log it, count it, move on
        print(f"Error processing {image_name}: {e}")
        traceback.print_exc()  # Print the full traceback for easier debugging
        error_count += 1
        continue

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

In [None]:
## IDRiD Check
# Count the preprocessed images per class folder (0-4) in the IDRiD output directory.
# NOTE: this folder receives the output of both the IDRiD training and test cells.
train_images_idrid_base = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/1_IDRiD"

# List each class folder once. A folder exists only if its grade occurred in a
# processed CSV, so a missing folder is counted as 0 instead of raising.
idrid_class_counts = {}
for cls in range(5):
    class_dir = os.path.join(train_images_idrid_base, str(cls))
    idrid_class_counts[cls] = len(os.listdir(class_dir)) if os.path.isdir(class_dir) else 0

num_train_images_idrid = sum(idrid_class_counts.values())

print(f"Total number of images in the IDRiD train set: {num_train_images_idrid}")

for cls, num_images in idrid_class_counts.items():
    print(f"Class {cls}: {num_images} images")

# 2 APTOS

In [None]:
# --- File and folder paths ---
# Preprocess the APTOS training images with the same pipeline as the other dataset
# cells: grayscale, enhancement, fundus masking, rescaling, letterboxing, PNG output
# into a folder named after the class label.
import traceback  # full tracebacks for per-image failures (see the except branch)

csv_path = '/home/jupyter-sdm/GENITO/DATASETS/2_APTOS/train.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/2_APTOS/train'
final_output_dir = '/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/2_APTOS/'

file_extension = ".png"
column_class_name = 'diagnosis'
colum_image_name = 'id_code'

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    exit()

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0      # missing/unreadable images, processing exceptions and failed writes
null_bbox_count = 0  # images saved as black because no usable fundus box was found

# Longest side, in pixels, that the cropped fundus is scaled to (identical for every image)
target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    image_name_no_ext = row[colum_image_name]
    image_name = image_name_no_ext + file_extension  # switched to .jpeg below if only that file exists
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name)  # Try .png first
    output_class_path = os.path.join(final_output_dir, image_class)

    # Handle cases where the image might have a different extension
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg"  # Keep the name in sync with the file actually read
        else:
            print(f"Warning: {image_name_no_ext} (with .png/.jpeg extensions) not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread reports an unreadable file by returning None
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox is not None:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale so the longer bbox side becomes target_max_dim_px
                # (both sides are > 0 here, so the division is safe)
                current_max_dim_bbox = max(w_bbox, h_bbox)
                scale_ratio = target_max_dim_px / current_max_dim_bbox

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_name}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_name}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) onto a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image (always as PNG, whatever the source extension)
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        if not cv2.imwrite(output_path, final_img_to_save):
            # cv2.imwrite reports a failed write through its return value, not an exception
            print(f"Error writing {output_path}")
            error_count += 1

    except Exception as e:
        # One bad image must not abort the whole run: log it, count it, move on
        print(f"Error processing {image_name}: {e}")
        traceback.print_exc()  # Print the full traceback for easier debugging
        error_count += 1
        continue

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

In [None]:
## APTOS Check
# Count the preprocessed images per class folder (0-4) in the APTOS output directory.
train_images_aptos_base = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/2_APTOS"

# List each class folder once. A folder exists only if its grade occurred in the
# processed CSV, so a missing folder is counted as 0 instead of raising.
aptos_class_counts = {}
for cls in range(5):
    class_dir = os.path.join(train_images_aptos_base, str(cls))
    aptos_class_counts[cls] = len(os.listdir(class_dir)) if os.path.isdir(class_dir) else 0

num_train_images_aptos = sum(aptos_class_counts.values())

print(f"Total number of images in the APTOS train set: {num_train_images_aptos}")

for cls, num_images in aptos_class_counts.items():
    print(f"Class {cls}: {num_images} images")

# 3 DeepDRiD

## DeepDRiD Training Set

In [None]:
# --- File and folder paths ---
# Preprocess the DeepDRiD training images with the same pipeline as the other dataset
# cells: grayscale, enhancement, fundus masking, rescaling, letterboxing, PNG output
# into a folder named after the class label.
import traceback  # full tracebacks for per-image failures (see the except branch)

csv_path = '/home/jupyter-sdm/GENITO/DATASETS/3_DeepDRiD/regular-fundus-training.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/3_DeepDRiD/train'
final_output_dir = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/3_DeepDRiD"

file_extension = ".jpg"
column_class_name = 'patient_DR_Level'
colum_image_name = 'image_id'

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    exit()

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0      # missing/unreadable images, processing exceptions and failed writes
null_bbox_count = 0  # images saved as black because no usable fundus box was found

# Longest side, in pixels, that the cropped fundus is scaled to (identical for every image)
target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    image_name_no_ext = row[colum_image_name]
    image_name = image_name_no_ext + file_extension  # switched to .jpeg below if only that file exists
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name)  # Try .jpg first
    output_class_path = os.path.join(final_output_dir, image_class)

    # Handle cases where the image might have a different extension
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg"  # Keep the name in sync with the file actually read
        else:
            print(f"Warning: {image_name_no_ext} (with .jpg/.jpeg extensions) not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread reports an unreadable file by returning None
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox is not None:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale so the longer bbox side becomes target_max_dim_px
                # (both sides are > 0 here, so the division is safe)
                current_max_dim_bbox = max(w_bbox, h_bbox)
                scale_ratio = target_max_dim_px / current_max_dim_bbox

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_name}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_name}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0)  # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) onto a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image (always as PNG, whatever the source extension)
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        if not cv2.imwrite(output_path, final_img_to_save):
            # cv2.imwrite reports a failed write through its return value, not an exception
            print(f"Error writing {output_path}")
            error_count += 1

    except Exception as e:
        # One bad image must not abort the whole run: log it, count it, move on
        print(f"Error processing {image_name}: {e}")
        traceback.print_exc()  # Print the full traceback for easier debugging
        error_count += 1
        continue

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

## DeepDRiD Validation Set

In [None]:
# --- DeepDRiD validation set: preprocess every image listed in the CSV ---
# For each CSV row: read the image, convert it to grayscale, segment the fundus,
# apply CLAHE + Gaussian blur + median filter, black out the background, rescale
# the cropped fundus, letterbox it to FINAL_IMAGE_SIZE and save it as a PNG in
# <final_output_dir>/<class>/.
import traceback  # imported once here instead of inside the except handler

# --- File and folder paths ---
csv_path = '/home/jupyter-sdm/GENITO/DATASETS/3_DeepDRiD/regular-fundus-validation.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/3_DeepDRiD/Validation'
final_output_dir = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/3_DeepDRiD"

file_extension = ".jpg"
column_class_name = 'patient_DR_Level'
colum_image_name = 'image_id'

# Load the label table. The error is re-raised instead of calling exit():
# in a Jupyter kernel exit() requests a kernel shutdown (losing everything the
# earlier cells defined), and in a plain script it would end with status 0.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    raise

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0       # unreadable / missing images and processing failures
null_bbox_count = 0   # images for which no usable fundus bounding box was found

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    image_name_no_ext = row[colum_image_name]
    image_name = image_name_no_ext + file_extension # Will be updated if a .jpeg is found
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name) # Try .jpg first
    output_class_path = os.path.join(final_output_dir, image_class)

    # Handle cases where the image might have a different extension
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg" # Update the filename for output
        else:
            print(f"Warning: {image_name_no_ext} (with .jpg/.jpeg extensions) not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread returns None (it does not raise) when the file cannot be decoded
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale the crop so that its longest side becomes target_max_dim_px.
                # Both bbox sides are > 0 here, so the division is always safe.
                target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)
                scale_ratio = target_max_dim_px / max(w_bbox, h_bbox)

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_name}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0) # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_name}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0) # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) into a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        # cv2.imwrite signals a failed write by returning False; turn that into
        # an exception so the failure is reported and counted below.
        if not cv2.imwrite(output_path, final_img_to_save):
            raise OSError(f"cv2.imwrite failed to write {output_path}")

    except Exception as e:
        # Per-image boundary: report the failure, count it and go on with the next image
        print(f"Error processing {image_name}: {e}")
        traceback.print_exc() # Print the full traceback for easier debugging
        error_count += 1

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

## DeepDRiD Test Set

In [None]:
# --- DeepDRiD test set: preprocess every image listed in the CSV ---
# For each CSV row: read the image, convert it to grayscale, segment the fundus,
# apply CLAHE + Gaussian blur + median filter, black out the background, rescale
# the cropped fundus, letterbox it to FINAL_IMAGE_SIZE and save it as a PNG in
# <final_output_dir>/<class>/.
import traceback  # imported once here instead of inside the except handler

# --- File and folder paths ---
csv_path = '/home/jupyter-sdm/GENITO/DATASETS/3_DeepDRiD/test1.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/3_DeepDRiD/test'
final_output_dir = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/3_DeepDRiD"

file_extension = ".jpg"
column_class_name = 'DR_Levels'
colum_image_name = 'image_id'

# Load the label table. The error is re-raised instead of calling exit():
# in a Jupyter kernel exit() requests a kernel shutdown (losing everything the
# earlier cells defined), and in a plain script it would end with status 0.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    raise

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0       # unreadable / missing images and processing failures
null_bbox_count = 0   # images for which no usable fundus bounding box was found

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    image_name_no_ext = row[colum_image_name]
    image_name = image_name_no_ext + file_extension # Will be updated if a .jpeg is found
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name) # Try .jpg first
    output_class_path = os.path.join(final_output_dir, image_class)

    # Handle cases where the image might have a different extension
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg" # Update the filename for output
        else:
            print(f"Warning: {image_name_no_ext} (with .jpg/.jpeg extensions) not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread returns None (it does not raise) when the file cannot be decoded
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale the crop so that its longest side becomes target_max_dim_px.
                # Both bbox sides are > 0 here, so the division is always safe.
                target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)
                scale_ratio = target_max_dim_px / max(w_bbox, h_bbox)

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_name}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0) # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_name}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0) # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) into a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        # cv2.imwrite signals a failed write by returning False; turn that into
        # an exception so the failure is reported and counted below.
        if not cv2.imwrite(output_path, final_img_to_save):
            raise OSError(f"cv2.imwrite failed to write {output_path}")

    except Exception as e:
        # Per-image boundary: report the failure, count it and go on with the next image
        print(f"Error processing {image_name}: {e}")
        traceback.print_exc() # Print the full traceback for easier debugging
        error_count += 1

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

In [None]:
## DeepDrid Check
# Sanity check: count the entries of each class folder (0-4) of the
# preprocessed DeepDRiD dataset and print the total and per-class numbers.
deepdrid_base_path = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/3_DeepDRiD"

# Each folder is listed only once and the counts are reused for the total.
# The preprocessing cells create a class folder only for classes that occur in
# the CSV, so a missing folder is reported as 0 images (with a warning) instead
# of aborting the whole check with FileNotFoundError.
deepdrid_class_counts = {}
for cls in range(5):
    class_dir = os.path.join(deepdrid_base_path, str(cls))
    if os.path.isdir(class_dir):
        deepdrid_class_counts[cls] = len(os.listdir(class_dir))
    else:
        print(f"Warning: class folder not found: {class_dir}")
        deepdrid_class_counts[cls] = 0

total_images_deepdrid = sum(deepdrid_class_counts.values())

print(f"Total number of images in the DeepDrid dataset: {total_images_deepdrid}")

for cls, num_images in deepdrid_class_counts.items():
    print(f"Class {cls}: {num_images} images")

# 4 Messidor2

## Messidor2 Training Set 

In [None]:
# --- Messidor2: preprocess every image listed in the CSV ---
# For each CSV row: read the image, convert it to grayscale, segment the fundus,
# apply CLAHE + Gaussian blur + median filter, black out the background, rescale
# the cropped fundus, letterbox it to FINAL_IMAGE_SIZE and save it as a PNG in
# <final_output_dir>/<class>/.
import traceback  # imported once here instead of inside the except handler

# --- File and folder paths ---
csv_path = '/home/jupyter-sdm/GENITO/DATASETS/4_MESSIDOR2/messidor_data.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/4_MESSIDOR2/images'
final_output_dir = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/4_Messidor2/"

column_class_name = 'diagnosis'
colum_image_name = 'id_code'

# Load the label table. The error is re-raised instead of calling exit():
# in a Jupyter kernel exit() requests a kernel shutdown (losing everything the
# earlier cells defined), and in a plain script it would end with status 0.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    raise

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0       # unreadable / missing images and processing failures
null_bbox_count = 0   # images for which no usable fundus bounding box was found

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    # Assuming 'id_code' in the CSV contains the full filename with extension
    image_filename = row[colum_image_name]
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_filename)
    output_class_path = os.path.join(final_output_dir, image_class)

    # Fallback check if the primary file path doesn't exist
    if not os.path.exists(src_path):
        # This fallback logic might be specific to the dataset's structure
        src_path_jpeg = os.path.join(image_dir, image_filename + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_filename = image_filename + ".jpeg" # Update the filename for output
        else:
            print(f"Warning: {image_filename} not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread returns None (it does not raise) when the file cannot be decoded
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # Base name for the debug dumps; bound unconditionally so that every
        # later debug branch can rely on it.
        debug_name = os.path.splitext(image_filename)[0]

        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_filename)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale the crop so that its longest side becomes target_max_dim_px.
                # Both bbox sides are > 0 here, so the division is always safe.
                target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)
                scale_ratio = target_max_dim_px / max(w_bbox, h_bbox)

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_filename}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0) # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_filename}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0) # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) into a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image
        output_filename = os.path.splitext(image_filename)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        # cv2.imwrite signals a failed write by returning False; turn that into
        # an exception so the failure is reported and counted below.
        if not cv2.imwrite(output_path, final_img_to_save):
            raise OSError(f"cv2.imwrite failed to write {output_path}")

    except Exception as e:
        # Per-image boundary: report the failure, count it and go on with the next image
        print(f"Error processing {image_filename}: {e}")
        traceback.print_exc() # Print the full traceback for easier debugging
        error_count += 1

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

In [None]:
## Messidor-2 Check
# Sanity check: count the entries of each class folder (0-4) of the
# preprocessed Messidor-2 dataset and print the total and per-class numbers.
messidor_base_path = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/4_Messidor2"

# Each folder is listed only once and the counts are reused for the total.
# The preprocessing cell creates a class folder only for classes that occur in
# the CSV, so a missing folder is reported as 0 images (with a warning) instead
# of aborting the whole check with FileNotFoundError.
messidor_class_counts = {}
for cls in range(5):
    class_dir = os.path.join(messidor_base_path, str(cls))
    if os.path.isdir(class_dir):
        messidor_class_counts[cls] = len(os.listdir(class_dir))
    else:
        print(f"Warning: class folder not found: {class_dir}")
        messidor_class_counts[cls] = 0

total_images_messidor = sum(messidor_class_counts.values())

print(f"Total number of images in the Messidor-2 dataset: {total_images_messidor}")

for cls, num_images in messidor_class_counts.items():
    print(f"Class {cls}: {num_images} images")

# 5 FGADR

## FGADR Training Set

In [None]:
# --- FGADR: preprocess every image listed in the CSV ---
# For each CSV row: read the image, convert it to grayscale, segment the fundus,
# apply CLAHE + Gaussian blur + median filter, black out the background, rescale
# the cropped fundus, letterbox it to FINAL_IMAGE_SIZE and save it as a PNG in
# <final_output_dir>/<class>/.
import traceback  # imported once here instead of inside the except handler

# --- File and folder paths ---
csv_path = '/home/jupyter-sdm/GENITO/DATASETS/5_FGADR/Seg-set/DR_Seg_Grading_Label.csv'
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/5_FGADR/Seg-set/Original_Images'
final_output_dir = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/5_FGADR/"

column_class_name = 'grade'
colum_image_name = 'image'

# Load the label table. The error is re-raised instead of calling exit():
# in a Jupyter kernel exit() requests a kernel shutdown (losing everything the
# earlier cells defined), and in a plain script it would end with status 0.
try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Error: CSV file not found at {csv_path}")
    raise

# Create output directories for each class
classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

error_count = 0       # unreadable / missing images and processing failures
null_bbox_count = 0   # images for which no usable fundus bounding box was found

for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing Images", ncols=100, ascii=True):
    # The 'image' column in the CSV should contain the full filename with its extension
    image_filename = row[colum_image_name]
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_filename)
    output_class_path = os.path.join(final_output_dir, image_class)

    # Fallback check if the primary file path doesn't exist
    if not os.path.exists(src_path):
        src_path_jpeg = os.path.join(image_dir, image_filename + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_filename = image_filename + ".jpeg" # Update the filename for output
        else:
            print(f"Warning: {image_filename} not found in {image_dir}")
            error_count += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread returns None (it does not raise) when the file cannot be decoded
    if img_bgr is None:
        print(f"Error reading {src_path}")
        error_count += 1
        continue

    try:
        # Base name for the debug dumps; bound unconditionally so that every
        # later debug branch can rely on it.
        debug_name = os.path.splitext(image_filename)[0]

        # 1. Convert to Grayscale
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus, create the mask, and get the bounding box
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_filename)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian Blur, and Median Filter to the grayscale image
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out the background of the processed image
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus Size Normalization ---
        if fundus_bbox:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus using the bounding box
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale the crop so that its longest side becomes target_max_dim_px.
                # Both bbox sides are > 0 here, so the division is always safe.
                target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)
                scale_ratio = target_max_dim_px / max(w_bbox, h_bbox)

                # Ensure the new dimensions are at least 1x1
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized and resized fundus to a PIL image
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{debug_name}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (e.g., width or height is 0)
                print(f"Warning: Invalid fundus bounding box for {image_filename}. The image will be black.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0) # Small placeholder image to be letterboxed
                null_bbox_count += 1
        else:
            # No bounding box found (no contour)
            print(f"Warning: No fundus bounding box for {image_filename}. The image will be black.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0) # Placeholder image
            null_bbox_count += 1

        # 5. Letterbox: place the image (now the normalized fundus) into a FINAL_IMAGE_SIZE canvas
        #    The padding color is 0 (black) because the fundus background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (CV2) for saving
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image
        output_filename = os.path.splitext(image_filename)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        # cv2.imwrite signals a failed write by returning False; turn that into
        # an exception so the failure is reported and counted below.
        if not cv2.imwrite(output_path, final_img_to_save):
            raise OSError(f"cv2.imwrite failed to write {output_path}")

    except Exception as e:
        # Per-image boundary: report the failure, count it and go on with the next image
        print(f"Error processing {image_filename}: {e}")
        traceback.print_exc() # Print the full traceback for easier debugging
        error_count += 1

print(f"Number of images that could not be read or had processing errors: {error_count}")
print(f"Number of images with null or invalid fundus bounding box: {null_bbox_count}")
print("Splitting and preprocessing complete!")

In [None]:
## FGADR Check
# Sanity check: count the entries of each class folder (0-4) of the
# preprocessed FGADR dataset and print the total and per-class numbers.
fgadr_base_path = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/5_FGADR"

# Each folder is listed only once and the counts are reused for the total.
# The preprocessing cell creates a class folder only for classes that occur in
# the CSV, so a missing folder is reported as 0 images (with a warning) instead
# of aborting the whole check with FileNotFoundError.
fgadr_class_counts = {}
for cls in range(5):
    class_dir = os.path.join(fgadr_base_path, str(cls))
    if os.path.isdir(class_dir):
        fgadr_class_counts[cls] = len(os.listdir(class_dir))
    else:
        print(f"Warning: class folder not found: {class_dir}")
        fgadr_class_counts[cls] = 0

total_images_fgadr = sum(fgadr_class_counts.values())

print(f"Total number of images in the FGADR dataset: {total_images_fgadr}")

for cls, num_images in fgadr_class_counts.items():
    print(f"Class {cls}: {num_images} images")

# 6 RLDR

## RLDR Training Set 

In [None]:
# Percorsi file e cartelle
csv_path = '/home/jupyter-sdm/GENITO/DATASETS/6_RLDR/retinal-lesions-v20191227/dr_grades.csv'  
image_dir = '/home/jupyter-sdm/GENITO/DATASETS/6_RLDR/retinal-lesions-v20191227/images_896x896' 
final_output_dir = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/6_RLDR/"

file_extension = ".jpg"
column_class_name = 'our label'
colum_image_name = 'image id'

try:
    df = pd.read_csv(csv_path)
except FileNotFoundError:
    print(f"Errore: File CSV non trovato a {csv_path}")
    exit()

classes = df[column_class_name].unique()
for cls in classes:
    class_output_path = os.path.join(final_output_dir, str(cls))
    os.makedirs(class_output_path, exist_ok=True)

# Imported once here instead of inside the per-image exception handler.
import traceback

# Counters reported in the summary printed after the loop.
conta_errate = 0      # images missing, unreadable, failed during processing, or not saved
conta_bbox_nulle = 0  # images whose fundus bounding box was missing or degenerate

# Main preprocessing loop: for every CSV row, locate the image, run the
# grayscale/mask/filter pipeline, normalize the fundus size, letterbox it onto
# a FINAL_IMAGE_SIZE canvas and save it as PNG inside its class directory.
for _, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing", ncols=100, ascii=True):
    # str() mirrors the handling of the class column below and prevents a
    # TypeError (which would abort the whole run) if the id column is numeric.
    image_name_no_ext = str(row[colum_image_name])
    image_name = image_name_no_ext + file_extension  # updated below if only a .jpeg exists
    image_class = str(row[column_class_name])
    src_path = os.path.join(image_dir, image_name)  # try the configured extension first
    output_class_path = os.path.join(final_output_dir, image_class)

    if not os.path.exists(src_path):
        # Fall back to the ".jpeg" extension before giving up on this row.
        src_path_jpeg = os.path.join(image_dir, image_name_no_ext + ".jpeg")
        if os.path.exists(src_path_jpeg):
            src_path = src_path_jpeg
            image_name = image_name_no_ext + ".jpeg"  # keep the reported name in sync
        else:
            print(f"Attenzione: {image_name_no_ext} (con estensioni .jpg/.jpeg) non trovato in {image_dir}")
            conta_errate += 1
            continue

    img_bgr = cv2.imread(src_path, cv2.IMREAD_COLOR)

    # cv2.imread signals an unreadable file by returning None, not by raising.
    if img_bgr is None:
        print(f"Errore nella lettura di {src_path}")
        conta_errate += 1
        continue

    try:
        # 1. Convert to grayscale.
        img_cv2_gray = convert_to_grayscale(img_bgr)

        # 2. Segment the fundus: get the mask and its bounding box. A copy is
        #    passed so the helper cannot modify the grayscale image used below.
        fundus_mask, fundus_bbox = segment_fundus_and_create_mask(img_cv2_gray.copy(), image_name)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_0_gray.png"), img_cv2_gray)
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_1_mask.png"), fundus_mask)

        # 3. Apply CLAHE, Gaussian blur and median filter to the grayscale image.
        img_clahe = apply_clahe(img_cv2_gray)
        img_gaussian_blurred = apply_gaussian_blur(img_clahe)
        img_median_filtered = apply_median_filter(img_gaussian_blurred)
        fully_processed_gray_data = img_median_filtered

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_2_fully_processed_gray.png"), fully_processed_gray_data)

        # 4. Apply the mask to black out everything outside the fundus.
        masked_processed_fundus_cv2 = cv2.bitwise_and(fully_processed_gray_data, fully_processed_gray_data, mask=fundus_mask)

        if DEBUG_SAVE_INTERMEDIATE:
            cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_3_masked_fundus.png"), masked_processed_fundus_cv2)

        # --- Fundus size normalization ---
        if fundus_bbox:
            x, y, w_bbox, h_bbox = fundus_bbox
            if w_bbox > 0 and h_bbox > 0:
                # Crop the masked fundus to its bounding box.
                cropped_fundus_cv2 = masked_processed_fundus_cv2[y:y+h_bbox, x:x+w_bbox]

                if DEBUG_SAVE_INTERMEDIATE:
                    cv2.imwrite(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_4_cropped_fundus.png"), cropped_fundus_cv2)

                # Scale the crop so its longer side equals the target size.
                target_max_dim_px = int(max(FINAL_IMAGE_SIZE) * FUNDUS_TARGET_SCALE_FACTOR)

                # Both sides are > 0 in this branch, so the division is safe.
                scale_ratio = target_max_dim_px / max(w_bbox, h_bbox)

                # Keep the new dimensions at least 1x1 after truncation.
                new_w = max(1, int(w_bbox * scale_ratio))
                new_h = max(1, int(h_bbox * scale_ratio))

                # INTER_AREA when shrinking, INTER_CUBIC when enlarging.
                interpolation = cv2.INTER_AREA if scale_ratio < 1 else cv2.INTER_CUBIC
                resized_cropped_fundus_cv2 = cv2.resize(cropped_fundus_cv2, (new_w, new_h), interpolation=interpolation)

                # Convert the normalized, resized fundus to a PIL image.
                image_to_letterbox_pil = cv2_to_pil_grayscale(resized_cropped_fundus_cv2)

                if DEBUG_SAVE_INTERMEDIATE:
                    image_to_letterbox_pil.save(os.path.join(DEBUG_OUTPUT_DIR, f"{image_name_no_ext}_5_resized_cropped_fundus.png"))
            else:
                # Invalid bounding box (zero width or height).
                print(f"Attenzione: Bounding box del fondo non valido per {image_name}. L'immagine sarà nera.")
                image_to_letterbox_pil = Image.new('L', (1,1), 0)  # tiny black placeholder to letterbox
                conta_bbox_nulle += 1
        else:
            # No bounding box was returned by the segmentation step.
            print(f"Attenzione: Nessun bounding box del fondo per {image_name}. L'immagine sarà nera.")
            image_to_letterbox_pil = Image.new('L', (1,1), 0)  # tiny black placeholder to letterbox
            conta_bbox_nulle += 1

        # 5. Letterbox: place the normalized fundus on a FINAL_IMAGE_SIZE canvas.
        #    Padding is 0 (black) because the masked background is already black.
        letterboxed_img_pil = letterbox_image(image_to_letterbox_pil, FINAL_IMAGE_SIZE, padding_color=0)

        # 6. Convert back to a NumPy array (cv2) for saving.
        final_img_to_save = pil_to_cv2_grayscale(letterboxed_img_pil)

        # 7. Save the processed image as PNG in its class directory.
        output_filename = os.path.splitext(image_name)[0] + '.png'
        output_path = os.path.join(output_class_path, output_filename)
        # cv2.imwrite reports failure through its boolean return value rather
        # than an exception, so an unchecked call would silently lose the image.
        if not cv2.imwrite(output_path, final_img_to_save):
            print(f"Errore nel salvataggio di {output_path}")
            conta_errate += 1

    except Exception as e:
        # Per-image boundary: log the failure with its traceback and move on,
        # so one bad image does not stop the whole run.
        print(f"Errore nel processing di {image_name}: {e}")
        traceback.print_exc()
        conta_errate += 1

print(f"Numero di immagini non lette o con errori di processing: {conta_errate}")
print(f"Numero di immagini con bounding box del fondo nullo o non valido: {conta_bbox_nulle}")
print("Processo di suddivisione e preprocessing completato!")

In [None]:
## RLDR Check
def count_images_per_class(base_path, class_labels):
    """Count the directory entries under ``base_path/<label>`` for each label.

    Each class directory is listed exactly once, so the per-class counts and
    any total derived from them always agree.

    Args:
        base_path: Dataset root that holds one sub-directory per class.
        class_labels: Iterable of class labels; each one is converted with
            ``str()`` to build the sub-directory name.

    Returns:
        A dict mapping every label to the number of entries in its directory.
        A directory that does not exist is reported with a warning and
        counted as 0 instead of aborting the check.
    """
    counts = {}
    for label in class_labels:
        class_dir = os.path.join(base_path, str(label))
        try:
            counts[label] = len(os.listdir(class_dir))
        except FileNotFoundError:
            print(f"Warning: class directory not found: {class_dir}")
            counts[label] = 0
    return counts


rldr_base_path = "/home/jupyter-sdm/GENITO/LAVORO_COMPLETO/Dataset_resize/6_RLDR"
# Classes are the sub-directories named "0" through "4".
rldr_class_counts = count_images_per_class(rldr_base_path, range(5))
total_images_rldr = sum(rldr_class_counts.values())

print(f"Total number of images in the RLDR dataset: {total_images_rldr}")

for cls, num_images in rldr_class_counts.items():
    print(f"Class {cls}: {num_images} images")