# Eyesense

Ocular Disease Intelligent Recognition (ODIR) is a structured ophthalmic database of 5,000 patients with age, color fundus photographs from left and right eyes and doctors' diagnostic keywords from doctors.

This dataset is meant to represent ‘‘real-life’’ set of patient information collected by Shanggong Medical Technology Co., Ltd. from different hospitals/medical centers in China. In these institutions, fundus images are captured by various cameras in the market, such as Canon, Zeiss and Kowa, resulting into varied image resolutions.
Annotations were labeled by trained human readers with quality control management. They classify patient into eight labels including:

-Normal (N),
-Diabetes (D),
-Glaucoma (G),
-Cataract (C),
-Age related Macular Degeneration (A),
-Hypertension (H),
-Pathological Myopia (M),
-Other diseases/abnormalities (O)

In [1]:
import os
from shutil import copy, move
import glob
import pathlib
import pandas as pd
import numpy as np
import cv2
from PIL import Image, ImageEnhance
import random
from scipy.ndimage import gaussian_filter

In [2]:
df = pd.read_csv("raw_data/full_df.csv")
df.head()

Unnamed: 0,ID,Patient Age,Patient Sex,Left-Fundus,Right-Fundus,Left-Diagnostic Keywords,Right-Diagnostic Keywords,N,D,G,C,A,H,M,O,filepath,labels,target,filename
0,0,69,Female,0_left.jpg,0_right.jpg,cataract,normal fundus,0,0,0,1,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",0_right.jpg
1,1,57,Male,1_left.jpg,1_right.jpg,normal fundus,normal fundus,1,0,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['N'],"[1, 0, 0, 0, 0, 0, 0, 0]",1_right.jpg
2,2,42,Male,2_left.jpg,2_right.jpg,laser spot，moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",2_right.jpg
3,4,53,Male,4_left.jpg,4_right.jpg,macular epiretinal membrane,mild nonproliferative retinopathy,0,1,0,0,0,0,0,1,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",4_right.jpg
4,5,50,Female,5_left.jpg,5_right.jpg,moderate non proliferative retinopathy,moderate non proliferative retinopathy,0,1,0,0,0,0,0,0,../input/ocular-disease-recognition-odir5k/ODI...,['D'],"[0, 1, 0, 0, 0, 0, 0, 0]",5_right.jpg


# Image categorization

In [3]:
dir_path = '../raw_data/raw_data/ODIR-5K/ODIR-5K/Training Images/'   #According to my computer.
img_list = os.listdir(dir_path)

We're not going to use the images categorized as 'Others'

In [4]:
if os.path.isdir('data_train/normal') is False:
    os.makedirs('data_train/normal')
    os.makedirs('data_train/diabets')
    os.makedirs('data_train/glaucoma')
    os.makedirs('data_train/cataract')
    os.makedirs('data_train/degeneration')
    os.makedirs('data_train/hypertension')
    os.makedirs('data_train/myopia')

In [5]:
if os.path.isdir('data_val/normal') is False:
    os.makedirs('data_val/normal')
    os.makedirs('data_val/diabets')
    os.makedirs('data_val/glaucoma')
    os.makedirs('data_val/cataract')
    os.makedirs('data_val/degeneration')
    os.makedirs('data_val/hypertension')
    os.makedirs('data_val/myopia')

In [6]:
if os.path.isdir('data_test/normal') is False:
    os.makedirs('data_test/normal')
    os.makedirs('data_test/diabets')
    os.makedirs('data_test/glaucoma')
    os.makedirs('data_test/cataract')
    os.makedirs('data_test/degeneration')
    os.makedirs('data_test/hypertension')
    os.makedirs('data_test/myopia')

In [7]:
if len(os.listdir('data_train/normal')) == 0:  # Check if the directory is empty

    for file in df.filename[df.labels == "['N']"]:
        copy(os.path.join(dir_path, file) , 'data_train/normal')
    for file in df.filename[df.labels == "['D']"]:
        copy(os.path.join(dir_path, file) , 'data_train/diabets')
    for file in df.filename[df.labels == "['G']"]:
        copy(os.path.join(dir_path, file) , 'data_train/glaucoma')
    for file in df.filename[df.labels == "['C']"]:
        copy(os.path.join(dir_path, file) , 'data_train/cataract')
    for file in df.filename[df.labels == "['A']"]:
        copy(os.path.join(dir_path, file) , 'data_train/degeneration')
    for file in df.filename[df.labels == "['H']"]:
        copy(os.path.join(dir_path, file) , 'data_train/hypertension')
    for file in df.filename[df.labels == "['M']"]:
        copy(os.path.join(dir_path, file) , 'data_train/myopia')

else:
    print("The directory 'data_train/normal' is not empty")
    print(f"\nProbably the files from {dir_path} were already copied into 'data_train/normal'.")
   

In [8]:
import random

source_paths = ['data_train/normal', 'data_train/diabets', 'data_train/glaucoma', 'data_train/cataract', 
                'data_train/degeneration', 'data_train/hypertension',
                'data_train/myopia']  

# Validation dataset
if len(os.listdir('data_val/normal')) == 0:
    for source in source_paths:
        dest = source.replace('data_train', 'data_val')
        n_files = int(0.2*len(os.listdir(source)) )   #Taking 20% of each folder
        for file in random.sample(os.listdir(source), n_files): 
            move(f"{source}/{file}", dest)
else:
    print("The directory 'data_val/normal' is not empty")
    print(f"\nProbably the files from {dir_path} were already copied into 'data_val/normal'.")

# Test dataset
if len(os.listdir('data_test/normal')) == 0:
    for source in source_paths:
        dest = source.replace('data_train', 'data_test')
        n_files = int(0.1*len(os.listdir(source)) )   #Taking 10% of each folder
        for file in random.sample(os.listdir(source), n_files): 
            move(f"{source}/{file}", dest)
else:
    print("The directory 'data_test/normal' is not empty")
    print(f"\nProbably the files from {dir_path} were already copied into 'data_test/normal'.")

In [9]:
data_dir_train = pathlib.Path('data_train')
data_dir_val   = pathlib.Path('data_val')
data_dir_test  = pathlib.Path('data_test')

In [10]:
train_length = len(list(data_dir_train.glob('*/*.jpg')))
val_length   = len(list(data_dir_val.glob('*/*.jpg')))
test_length  = len(list(data_dir_test.glob('*/*.jpg')))

print(f"Train: {train_length}")
print(f"Val:   {val_length}")
print(f"Test:  {test_length}")

Train: 4100
Val:   1133
Test:  451


In [11]:
len(df[df.labels == "['O']"])

708

In [12]:
assert (test_length + val_length + train_length)  == (len(df) - len(df[df.labels == "['O']"]) )

# Data augmentation

In [13]:
def augment_fundus_image(image_path):
    """
    Load a fundus image, apply 8 different augmentations, and save the transformed versions
    with descriptive suffixes.
    
    Args:
        image_path (str): Path to the original fundus image
    """
    # Load the image
    img = Image.open(image_path)
    
    # Get the filename and directory
    img_dir, img_filename = os.path.split(image_path)
    filename, ext = os.path.splitext(img_filename)
    
    # 1. Rotation (±20 degrees)
    angle = random.uniform(-20, 20)
    rotated = img.rotate(angle, resample=Image.BICUBIC, expand=False)
    rotated_path = os.path.join(img_dir, f"{filename}_rot{angle:.1f}{ext}")
    rotated.save(rotated_path)
    
    # 2. Horizontal flip
    flipped_h = img.transpose(Image.FLIP_LEFT_RIGHT)
    flipped_h_path = os.path.join(img_dir, f"{filename}_fliph{ext}")
    flipped_h.save(flipped_h_path)
    
    # 3. Brightness adjustment (+15%)
    brightness_enhancer = ImageEnhance.Brightness(img)
    brightened = brightness_enhancer.enhance(1.15)
    brightened_path = os.path.join(img_dir, f"{filename}_bright+15{ext}")
    brightened.save(brightened_path)
    
    # 4. Brightness adjustment (-15%)
    darkened = brightness_enhancer.enhance(0.85)
    darkened_path = os.path.join(img_dir, f"{filename}_bright-15{ext}")
    darkened.save(darkened_path)
    
    # 5. Contrast adjustment (+15%)
    contrast_enhancer = ImageEnhance.Contrast(img)
    contrast_increased = contrast_enhancer.enhance(1.15)
    contrast_increased_path = os.path.join(img_dir, f"{filename}_contr+15{ext}")
    contrast_increased.save(contrast_increased_path)
    
    # 6. Slight Gaussian blur
    # Convert to numpy array for OpenCV processing
    img_np = np.array(img)
    blurred = cv2.GaussianBlur(img_np, (5, 5), 0)
    blurred_img = Image.fromarray(blurred)
    blurred_path = os.path.join(img_dir, f"{filename}_blur{ext}")
    blurred_img.save(blurred_path)
    
    # 7. Small translation
    width, height = img.size
    x_shift = int(width * 0.05)  # 5% shift
    y_shift = int(height * 0.05)  # 5% shift
    
    # Create a new image with the same size and black background
    translated = Image.new('RGB', (width, height))
    # Paste the original image with an offset
    translated.paste(img, (x_shift, y_shift))
    translated_path = os.path.join(img_dir, f"{filename}_trans{ext}")
    translated.save(translated_path)
    
    # 8. Slight scaling (zoom in 10%)
    scale_factor = 1.1
    
    # Calculate new dimensions
    new_width = int(width * scale_factor)
    new_height = int(height * scale_factor)
    
    # Resize image
    scaled = img.resize((new_width, new_height), Image.BICUBIC)
    
    # Crop back to original size (from center)
    left = (new_width - width) // 2
    top = (new_height - height) // 2
    right = left + width
    bottom = top + height
    scaled = scaled.crop((left, top, right, bottom))
    
    scaled_path = os.path.join(img_dir, f"{filename}_zoom10{ext}")
    scaled.save(scaled_path)
    
    #print(f"Successfully created 8 augmented versions of {img_filename}")

    return None

In [14]:
df["labels"].value_counts()

labels
['N']    2873
['D']    1608
['O']     708
['C']     293
['G']     284
['A']     266
['M']     232
['H']     128
Name: count, dtype: int64

In [15]:
path = "data_train" # folder created in pre_processing
dir_list = os.listdir(path)
dir_list  #We can note that there is no folder for category 'others'.

['degeneration',
 'myopia',
 'normal',
 'hypertension',
 'glaucoma',
 'cataract',
 'diabets']

In [16]:
dir_list.remove('normal') # the mais class
dir_list.remove('diabets') # the second class
dir_list

['degeneration', 'myopia', 'hypertension', 'glaucoma', 'cataract']

These are the categories that will have data augmentation

In [17]:
for category_name in dir_list:

    path = 'data_train'    
    category_path = os.path.join(path, category_name) #Ex: data_train/degeneration   
    image_list = os.listdir(category_path)
    image_path = [os.path.join(category_path, x) for x in image_list]

    for x in image_path:
        augment_fundus_image(x)

        
#Do the same for validation dataset
    path = 'data_val'
    category_path = os.path.join(path, category_name) #Ex: data_val/degeneration   
    image_list = os.listdir(category_path)
    image_path = [os.path.join(category_path, x) for x in image_list]

    for x in image_path:
        augment_fundus_image(x)
     
    
    print(f"{category_name} done")


degeneration done
myopia done
hypertension done
glaucoma done
cataract done


In [18]:
train_length = len(list(data_dir_train.glob('*/*.jpg')))
val_length  = len(list(data_dir_test.glob('*/*.jpg')))
test_length  = len(list(data_dir_test.glob('*/*.jpg')))


print(f"Train: {train_length}")
print(f"Val: {val_length}")
print(f"Test:  {test_length}")

Train: 11068
Val: 451
Test:  451


In [32]:
processed_number = 0
for category_name in dir_list:
    number_of_files = len(list(data_dir_train.glob(f'{category_name}/*.jpg')))
    print(f"For {category_name} in training dataset, there are {number_of_files} images")
    processed_number += number_of_files
     
    number_of_files = len(list(data_dir_val.glob(f'{category_name}/*.jpg')))
    print(f"For {category_name} in validation dataset, there are {number_of_files} images")
    processed_number += number_of_files

    number_of_files = len(list(data_dir_test.glob(f'{category_name}/*.jpg')))
    print(f"For {category_name} in test dataset, there are {number_of_files} images")
    print("---------------------------------------------------------------------")
    processed_number += number_of_files

print(f"Total of files: {processed_number}")

For degeneration in training dataset, there are 1728 images
For degeneration in validation dataset, there are 477 images
For degeneration in test dataset, there are 21 images
---------------------------------------------------------------------
For myopia in training dataset, there are 1512 images
For myopia in validation dataset, there are 414 images
For myopia in test dataset, there are 18 images
---------------------------------------------------------------------
For hypertension in training dataset, there are 837 images
For hypertension in validation dataset, there are 225 images
For hypertension in test dataset, there are 10 images
---------------------------------------------------------------------
For glaucoma in training dataset, there are 1854 images
For glaucoma in validation dataset, there are 504 images
For glaucoma in test dataset, there are 22 images
---------------------------------------------------------------------
For cataract in training dataset, there are 1908 im

In [40]:
unprocessed_number = len(df[df.labels == "['N']"]) + len(df[df.labels == "['D']"]) + len(df[df.labels == "['O']"])
                         
#assert ((processed_number/8 + unprocessed_number)  == len(df) )
processed_number/8

1259.375