In [16]:
#updates on this code:
# it is able to adjust the number of images we want to use
# adds functionality to check for corrupted images (one of the images is corrupted and I don't know which yet)

#Potential problems:
# the image preprocessing is done as a code line that may have to be run again
# It would probably be better to run our preprocessing on all the images separately and potentially save them as new images
# Probably best to do this after we crop out everything but the optic disk

# There's also no current way to pause tensor flow or any of the processes

In [17]:
#code written by Matthew Miller (adapted from chatGPT and prior code from class)

In [18]:
import tensorflow as tf
from tensorflow.keras import layers, models

import pandas as pd
import matplotlib.pyplot as plt
import cv2
import os
import numpy as np

import random

## Set the image folder and read the CSV file (the CSV files and the images must be in a single folder)

In [19]:
# Folder containing the image files and the corresponding CSV file
folder_path = "Glaucoma_Balanced_Dataset/JustRAIGS"
csv_file = "JustRAIGS_Train_labels_balance.csv"
csv_path = os.path.join(folder_path,csv_file)
print(csv_path)

# Read the CSV file containing the image filenames and classifications
data = pd.read_csv(csv_path)
#print(data)

Glaucoma_Balanced_Dataset/JustRAIGS\JustRAIGS_Train_labels_balance.csv


## !!!!!!!  CAUTION  !!!!!!! DO NOT RUN WITH A LARGE NUMBER OF DATA FILES IF YOU DON'T WANT TO WAIT

In [28]:
# Define the number of random images to select
num_random_images = 2500  # Change this number as needed from 2 to 6540


# Define empty lists to store the image data and corresponding classifications
images = []
classifications = []

# Define the target size for resizing the images
## This step is limiting our resolution, probably way too aggressive eventually, but fine for testing
target_size = (100, 100)  # Adjust the size as needed

#target_size = (1944, 1944)  # this should make the images square initally and size down to the most common
#smallest dimension for a few of the images.
#Going larger may force interpolation, which wouldn't be ideal.
# this method is also distorting the aspect ratio, but that won't matter later since the optic disk is what we care about,
#and we can define a constant size for that after preprocessing

# Shuffle the rows in the DataFrame to ensure randomness
data_shuffled = data.sample(frac=1)


# Iterate over each row in the CSV file
for index, row in data_shuffled.head(num_random_images).iterrows():
    # Read the image file
    image_filename = row['Eye ID']  + ".jpg" # The column containing the image filenames
    #print(image_filename)
    image_path = os.path.join(folder_path, image_filename)
    #print(image_path)
    
    
    
    # Check if the image file exists
    if os.path.exists(image_path):
        # Load the image
    
    
        image = cv2.imread(image_path)  # Use cv2.imread for reading images
    
    
        # Check if the image was loaded successfully
        if image is not None: 
    
            # Preprocess the image as necessary (e.g., resizing, hist eq, normalization)  
            # Preprocess the image by resizing it to the target size
            image_resized = cv2.resize(image, target_size)


            b, g, r = cv2.split(image_resized)
            # Perform histogram equalization and normalization on each channel
            b_eq = cv2.equalizeHist(b)
            g_eq = cv2.equalizeHist(g)
            r_eq = cv2.equalizeHist(r)
            # Merge the equalized channels back into a color image
            image_eq = cv2.merge((b_eq, g_eq, r_eq))

            # Normalize the image intensity values
            image_eq = cv2.normalize(image_eq, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)


            # Add the preprocessed image to the images list
            images.append(image_eq)

            # Display the image if testing is necessary(RGB stuff is so the images display with right colors (BGR to RGB))
            #image_rgb = cv2.cvtColor(image_resized, cv2.COLOR_BGR2RGB)
            #plt.imshow(image_rgb)
            #plt.show()

            # Get the classification label
            classification = row['Final Label']  # Assuming 'Classification' is the column containing the classifications
            classifications.append(classification)
            
        else:
            print(f"Failed to load image: {image_path}")
    else:
        print(f"Image file not found: {image_path}") 
            

Image file not found: Glaucoma_Balanced_Dataset/JustRAIGS\TRAIN000426.jpg


In [29]:
# Convert the lists to numpy arrays for further processing
images = np.array(images)
#print(images)

#This next line works correctly for reading the classifications
classifications = np.array(classifications)
#print(classifications)
# Now you can use the images and classifications for training your model

## Need to split the data now into training and validation

In [30]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
#0.2 is setting the validation data size to 20% of the total (80/20 split) and the random state is the seed
#This is randomly picking data points, which is good
X_train, X_val, y_train, y_val = train_test_split(images, classifications, test_size=0.2, random_state=42)

# Print the sizes of the training and validation sets
print("Number of training samples:", len(X_train))
print("Number of validation samples:", len(X_val))


Number of training samples: 1999
Number of validation samples: 500


## This is our neural network, the parameters and network architecture are just an example

## This is running a little too quickly rn, we might need more layers or depth

In [31]:

# Step 3: Build the Neural Network Model
#change the image height and width (100x100 for this example to fit the output of the preprocessed images above)
#make the image height equal to the image width for the future

image_height = 100
image_width = 100

model = models.Sequential([
    layers.Conv2D(32, (3, 3), activation='relu', input_shape=(image_height, image_width, 3)),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.Flatten(),
    layers.Dense(64, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

# Step 4: Compile the Model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Step 5: Train the Model
#history = model.fit(train_images, train_labels, epochs=10, validation_data=(val_images, val_labels))

history = model.fit(X_train, y_train, epochs=1, validation_data=(X_val, y_val))


# Step 6: Evaluate the Model
#test_loss, test_acc = model.evaluate(test_images, test_labels)
#print('Test accuracy:', test_acc)

loss, accuracy = model.evaluate(X_val, y_val)
print("Validation accuracy:", accuracy)


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 135ms/step - accuracy: 0.5293 - loss: 19.7085 - val_accuracy: 0.6060 - val_loss: 0.6770
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 38ms/step - accuracy: 0.5904 - loss: 0.6816
Validation accuracy: 0.6060000061988831
