# Diabetic Retinopathy Detection - Jeffery A. Brown

# Dependencies

In [35]:
# %matplotlib inline
# %matplotlib notebook
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D    # Support 3D graphing

import numpy as np
import pandas as pd
from pandas.plotting import table
import math
import random
import shutil
import os

# from imutils import paths
# import argparse
# import pickle
# import cv2

from pprint import pprint

# Visualization
import graphviz
# import pydotplus
from IPython.display import Image

# Machine Learning - Data Preparation and Pre-Processing
from sklearn.model_selection import train_test_split # Split data into training and testing samples
from sklearn.model_selection import cross_val_score  # Score a model using k-fold or other cross validation

from sklearn.preprocessing import OneHotEncoder   # Convert categorical integer features (X) to One-Hot encoded values
from sklearn.preprocessing import LabelEncoder    # Convert categorical labeled values to categorical integer values
from sklearn.preprocessing import LabelBinarizer  # Convert categorical labeled values to Binary encoded values

from sklearn.preprocessing import StandardScaler  # Scale numerical features to standard normal distribution
from sklearn.preprocessing import MinMaxScaler    # Scale numerical values based upon min/max values

# Machine Learning - Sci-Kit Learn - Models - Regression
from sklearn.linear_model import LinearRegression  # TBD
from sklearn.linear_model import Lasso             # TBD
from sklearn.linear_model import Ridge             # TBD
from sklearn.linear_model import ElasticNet        # TBD

# Machine Learning - Sci-Kit Learn - Models - Classification
from sklearn.linear_model import LogisticRegression   # Logistic Regression Classifier
from sklearn import tree                              # Decision Tree Classifier
from sklearn.ensemble import RandomForestClassifier   # Random Forest Classifier
from sklearn import svm                               # Support Vector Machine Classifier
from sklearn.neighbors import KNeighborsClassifier    # K-Nearest Neighbors (KNN)

# Machine Learning - GridSearch for Hyper-Parameter tuning
from sklearn.model_selection import GridSearchCV      # Grid Search

# Machine Learning - Quantify Model Performance
from sklearn.metrics import mean_squared_error    # Mean Squared Error (MSE) metric
from sklearn.metrics import r2_score              # R-squared (Coefficient of Determination) metric
from sklearn.metrics import confusion_matrix      # Generate a confusion matrix (actual vs. predicted counts)
from sklearn.metrics import classification_report # Calculate metrics for prediction performance
from sklearn.metrics import precision_score       # Calculate the precision: Tp / (Tp + Fp) => Ability to avoid false positives
from sklearn.metrics import recall_score          # Calculate the recall: Tp / (Tp + Fn) => Ability to find all positive samples
from sklearn.metrics import f1_score              # Calculate the F1 score: 2*(precision*recall)/(precision+recall)

# Machine Learning - Dataset Generation
from sklearn.datasets import make_regression     # Generate linear data
from sklearn.datasets import make_s_curve        # Generate nonlinear data
from sklearn.datasets import make_blobs          # Generate blobs for classification
from sklearn.datasets import make_circles        # Generate circles for classification
from sklearn.datasets import load_iris           # Sample multi-class dataset for classification
from sklearn.datasets import make_classification # Generate datasets for classification

# Machine Learning - Keras (Tensorflow) - Models
from keras.models import Sequential               # Sequential model serving as foundation for neural network
# from keras.layers import Dense                    # Nodes for specifying input, hidden, and output layers
from keras.layers.core import Activation
from keras.layers.core import Flatten
from keras.layers.core import Dropout
from keras.layers.core import Dense

# Machine Learning - Keras (Tensorflow) - Convolutional Neural Networks (Image Classification)
from keras.layers.normalization import BatchNormalization
from keras.layers.convolutional import Conv2D
from keras.layers.convolutional import MaxPooling2D
from keras import backend as K

# Machine Learning - Keras (Tensorflow) - Optimizers
from keras.optimizers import SGD
from keras.optimizers import Adam

# Machine Learning - Keras (Tensorflow) - Encoding
from keras.utils import to_categorical            # One-Hot Encoder provided through Keras

# Machine Learning - Keras (Tensorflow) - Other related Tools
from keras.utils import plot_model                # Plot a neural network model
from keras.models import load_model               # Load a saved machine learning model
from keras.preprocessing import image             # Loads an image for application of machine learning
from keras.preprocessing.image import img_to_array # Converts an image to a numpy array
from keras.preprocessing.image import ImageDataGenerator

# Machine Learning - Keras (Tensorflow) -  Dataset Generation
from keras.datasets import mnist                  # Images: Handwritten digits 0-9 (28x28 grayscale, 60K train, 10K test)

# Predict the Class for Official Test Images

## Load the Saved Convolutional Neural Network Model

In [36]:
# Restore the previously saved network from its HDF5 file so it can be
# used for inference on the official test images
model = load_model(
    filepath="../input/saved-model-jab/saved_model.h5",
)

## Copy the Official Test Data to a Structure Compliant with ImageDataGenerator flow_from_directory function

In [37]:
# Function to copy the official test data into a folder structure that
# can be used with ImageDataGenerator and the flow_from_directory method,
# which requires the test images to be in a *subfolder* under the main test data folder!
# So unfortunately, since cannot change the test data folder structure I need to *copy*
# all of the test data to another folder with proper structure
def copy_official_test_images( a_source_dir = None, a_dest_dir = None ):
    """Copy every regular file in ``a_source_dir`` into ``<a_dest_dir>/test_subfolder``.

    ``ImageDataGenerator.flow_from_directory`` needs the images to sit in a
    *subfolder* of the directory it is pointed at, so the flat folder of
    official test images is mirrored into a compliant structure.

    Args:
        a_source_dir: Folder holding the image files to copy. Only regular
            files directly inside it are copied; subfolders are skipped.
        a_dest_dir: Folder under which ``test_subfolder`` is created (along
            with any missing parent folders).

    Returns:
        True if every file was copied successfully, False if an argument was
        missing, the source folder does not exist, or any copy failed.
    """

    # Basic argument checking
    if (a_source_dir is None) or (a_dest_dir is None):
        print(f"Error: Source or Destination directory not specified")
        return False

    # Fail early (before creating any folders) when there is nothing to copy from
    if not os.path.isdir(a_source_dir):
        print(f"Error: Source directory {a_source_dir} does not exist")
        return False

    # Create the destination folder and its required subfolder in one step;
    # makedirs also creates missing parents and tolerates existing folders
    dir_subfolder = os.path.join(a_dest_dir, "test_subfolder")
    os.makedirs(dir_subfolder, exist_ok=True)

    # Copy all files in a_source_dir to dir_subfolder, remembering any failure
    all_copied = True

    for file_name in os.listdir(a_source_dir):

        full_file_name = os.path.join(a_source_dir, file_name)

        # Skip subfolders and anything else that is not a regular file
        if not os.path.isfile(full_file_name):
            continue

        print(f"Copying image {full_file_name} to {dir_subfolder}... ", end='')
        try:
            shutil.copy2(full_file_name, dir_subfolder)
            print(f"done.")

        except OSError as copy_error:
            # OSError covers FileNotFoundError as well as permission and
            # disk-space problems; report it and keep going with the rest
            print(f"error while attempting to copy: {copy_error}")
            all_copied = False

    return all_copied

In [38]:
# Official competition test images, all stored flat in one folder
# (these are only used to produce the submitted predictions).
# For a local run, point this at "./data/test_images" instead.
dir_src_test = "../input/aptos2019-blindness-detection/test_images"

# Working folder intended to hold a flow_from_directory-compliant copy of the test images
dir_official_test = "./official_test"

In [39]:
# Tried to copy the image files to address the keras ImageDataGenerator incompatibility with the Kaggle competition file structure, but exceeded a "500 file" limit!
# copy_official_test_images( dir_src_test, dir_official_test )

# So, will just try to see if specifying the test image folder directly will work anyway
# dir_official_test = dir_src_test

In [40]:
# CSV listing the official test images (one row per image);
# for a local run use "./data/test.csv" instead
test_tally_file = "../input/aptos2019-blindness-detection/test.csv"

# Read the listing into a dataframe and show how many rows it holds
test_tally_df = pd.read_csv(filepath_or_buffer=test_tally_file)
test_tally_df.shape[0]

1928

## Configure the Image Generator for the Official Test Images

In [42]:
# image_dims = (2136, 3216, 3)   # Height, Width, Depth (RGB = 3, Grayscale = 1)
image_dims = (96, 96, 3)         # Height, Width, Depth (RGB = 3, Grayscale = 1)

# Create an Image Data Generator for the official test data
# NOTE(review): no rescaling/normalisation is configured here -- confirm this
# matches the preprocessing that was applied when the saved model was trained
official_test_datagen = ImageDataGenerator()

# Define generator batch size and step size
# (batch size 1 => one step per image, so no test image is dropped)
official_test_n_images = len(test_tally_df)
official_test_n_batch_size = 1
official_test_n_step_size = official_test_n_images // official_test_n_batch_size

# Associate the generator with the official test images
official_test_generator = official_test_datagen.flow_from_directory(
    directory="../input/aptos2019-blindness-detection",   # Folder containing the subfolder with the official test images
    classes=["test_images"],                     # Restrict the generator to the test images subfolder only
    class_mode='categorical',                    # Labels are not meaningful here (single pseudo-class); only the images are used
    target_size=(image_dims[0], image_dims[1]),  # Keras expects (Height, Width)
    color_mode="rgb",                            # Color images
    batch_size=official_test_n_batch_size,       # Batch size
    shuffle=False,                               # Keep file order fixed so predictions line up with generator.filenames
    seed=1                                       # Random number seed (only relevant if shuffle were enabled)
)

Found 1928 images belonging to 1 classes.


## Predict the Classifications based upon the Images

In [None]:
# Rewind the generator so that prediction starts from its first batch
official_test_generator.reset()

# Run the model over every batch the generator provides (one full pass)
official_pred_onehot = model.predict_generator(
    official_test_generator,
    steps=len(official_test_generator),
    verbose=1,
)

# For each image, pick the index of the highest-scoring output as the predicted class
official_predicted_class = np.argmax(
    official_pred_onehot,
    axis=1,
)
official_predicted_class

 343/1928 [====>.........................] - ETA: 1:06

## Save the Predictions from the Official Test Data

In [0]:
# Save the prediction results to a file

# Get the list of test image id codes used by the test generator to make the predictions.
# Each generator filename is a relative path such as "test_images/<id_code>.png";
# taking the basename and dropping the extension is separator-independent and
# only strips the trailing extension (not a ".png" occurring elsewhere in the name).
official_test_id_code_list = [
    os.path.splitext(os.path.basename(x))[0]
    for x in official_test_generator.filenames
]

In [0]:

# Pair each test image id code with its predicted diagnosis in a dataframe
official_results_df = pd.DataFrame(
    data={
        "id_code": official_test_id_code_list,
        "diagnosis": official_predicted_class,
    }
)

# Write the submission file (without the dataframe index column)
official_results_df.to_csv("./submission.csv", index=False)
# official_results_df.to_csv("../input/submission.csv",index=False)

official_results_df

In [0]:
# !ls -al ../working