# Data Processing

In [None]:
!unzip Data.zip

In [None]:
import os
from typing import *
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing import image
from tensorflow.keras import layers
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Root of the dataset, given relative to the notebook's working directory.
data_dir = "../../Data"

# Each split directory contains a nested folder of the same name.
train_data_dir = Path(data_dir) / "train"
test_data_dir = Path(data_dir) / "test"
train_images_dir = train_data_dir / "train"
test_images_dir = test_data_dir / "test"

In [None]:
# Read the training labels from the CSV stored in the train split directory.
train_csv_path = train_data_dir / "train.csv"
df = pd.read_csv(train_csv_path)

In [None]:
# get a basic idea of what the csv data looks like
df.head()

Unnamed: 0,filename,opacity,diabetic retinopathy,glaucoma,macular edema,macular degeneration,retinal vascular occlusion,normal
0,c24a1b14d253.jpg,0,0,0,0,0,1,0
1,9ee905a41651.jpg,0,0,0,0,0,1,0
2,3f58d128caf6.jpg,0,0,1,0,0,0,0
3,4ce6599e7b20.jpg,1,0,0,0,1,0,0
4,0def470360e4.jpg,1,0,0,0,1,0,0


In [None]:
# Summary statistics for the label columns. They are of limited use because the
# columns are binary encoded, but the mean still shows how often each condition occurs.
df.describe()

Unnamed: 0,opacity,diabetic retinopathy,glaucoma,macular edema,macular degeneration,retinal vascular occlusion,normal
count,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0,3435.0
mean,0.446288,0.219796,0.173799,0.150218,0.167103,0.128093,0.152838
std,0.497179,0.414169,0.378992,0.357337,0.373122,0.334242,0.359884
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,1.0,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
# Show the data types and non-null counts of every column.
# Looks like there are no missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3435 entries, 0 to 3434
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   filename                    3435 non-null   object
 1   opacity                     3435 non-null   int64 
 2   diabetic retinopathy        3435 non-null   int64 
 3   glaucoma                    3435 non-null   int64 
 4   macular edema               3435 non-null   int64 
 5   macular degeneration        3435 non-null   int64 
 6   retinal vascular occlusion  3435 non-null   int64 
 7   normal                      3435 non-null   int64 
dtypes: int64(7), object(1)
memory usage: 214.8+ KB


In [None]:
# Rename the columns so they do not contain any spaces. The new names are derived
# from the existing header (spaces become underscores) instead of being assigned
# from a hard-coded list, so every name stays attached to its own column even if
# the column order or count of the CSV changes.
df.columns = df.columns.str.replace(" ", "_", regex=False)

We are interested in classifying diabetic retinopathy and glaucoma vs. normal.
However, instead of throwing away everything else, let's add a 4th class and
call it "other". This way the model can still learn from those images.

Let's add an additional column that we will call **label**. Normal will be assigned 0, glaucoma 1, diabetic retinopathy 2, and every other condition 3.


In [None]:
# Integer label for each class we want to distinguish; every image that is not
# relabelled afterwards keeps the "other" label.
class_labels = {
    'normal': 0,
    'glaucoma': 1,
    'diabetic_retinopathy': 2,
    'other': 3
}

# Start with every row labelled "other"; this way we only need to modify the labels
# of the diseases we are interested in classifying. The value is looked up by name
# rather than derived from the size of the dict, so it stays correct if classes are
# added or reordered, and the column is created as integers instead of floats.
df['label'] = class_labels['other']

In [None]:
# Overwrite the default label for every class that has its own indicator column.
# Classes are applied in dictionary order, so a row flagged for several of them
# ends up with the label of the last matching class.
for name, value in class_labels.items():
    if name != 'other':
        has_condition = df[name] == 1
        df.loc[has_condition, 'label'] = int(value)

# store the labels as 16-bit integers
df['label'] = df['label'].astype(np.int16)

In [None]:
# Count how many rows we have in each class.
# Classes 0, 1 and 2 are somewhat balanced, but the "other" class (3) has
# significantly more entries, which may be a problem.
# Let's try subsampling the "other" class and keep only 600 random rows.
df.label.value_counts()

3    1577
2     755
1     578
0     525
Name: label, dtype: int64

In [None]:
# Keep only 600 randomly chosen rows of the "other" class (label 3) and all rows of
# the remaining classes. The fixed random_state makes the subsample identical on
# every run, so the notebook is reproducible after a kernel restart.
subsampled_df = df[df.label == 3].sample(n=600, random_state=42)
no_other_df = df[df.label != 3]
new_df = pd.concat([no_other_df, subsampled_df])
new_df.label.value_counts()

2    755
3    600
1    578
0    525
Name: label, dtype: int64

For the next step we will create the train/validation splits.

We will use 70% as the training data and 30% as the validation data

In [None]:
# Use a trick to avoid complicated index manipulations: draw one uniform number per
# row and send the rows whose number is below 0.7 to the training set. The split is
# therefore only approximately 70/30.
# The generator is seeded so the split is identical on every run. This matters
# because group_images() does not rebuild an image directory that already exists,
# so a different split on re-run would no longer match the images on disk.
split_rng = np.random.default_rng(42)
mask = split_rng.random(len(new_df)) < 0.7
train_df = new_df[mask]
val_df = new_df[~mask]

In [None]:
# (rows, columns) of the training and the validation frame
train_df.shape, val_df.shape

((1734, 9), (724, 9))

In [None]:
# let's examine the per-class counts of the training split
# to make sure things are somewhat balanced
train_df.label.value_counts()

2    524
3    426
1    412
0    372
Name: label, dtype: int64

In [None]:
# same check for the validation split
val_df.label.value_counts()

2    231
3    174
1    166
0    153
Name: label, dtype: int64

Now we need to create the data generators: one for training and one for validation. Since we are dealing with large images (512, 512, 3), we cannot read all of them into memory. We will use the TensorFlow ImageDataGenerator class, which will also handle data augmentation for us. Because we do not have the image tensors in memory, we will use the **flow_from_directory** method. Before that, however, we need to save the images in separate folders, depending on which class label they have been assigned.

In [None]:
# do not touch the original data, we will create a separate directory for this
# the plan is to go through every image file in the dataframe, open it and save it in a new location
# following the pattern grouped_images/<class_label>/<image_name>.jpg

def group_images(dataframe, split_name: str):
    """Save the images listed in ``dataframe`` into one folder per class label.

    Every image is loaded from ``train_images_dir`` and saved again as
    ``grouped_images_<split_name>/<label>/<filename>``.

    Args:
        dataframe: frame with a ``filename`` and a ``label`` column, one row per image.
        split_name: suffix of the output directory name, e.g. ``"train"`` or ``"val"``.

    Note:
        If ``grouped_images_<split_name>`` already exists, nothing is written; the
        grouping is assumed to have been done by an earlier run.
    """
    # use the path module for easier manipulation of directories
    grouped_images_dir = Path(f"grouped_images_{split_name}")

    # only do this if the directory does not exist
    # otherwise assume this has already been done
    if not grouped_images_dir.exists():
        grouped_images_dir.mkdir()
        total = len(dataframe)

        # The progress counter comes from enumerate(): the value yielded by
        # iterrows() is the frame's index label, not the position of the row.
        for position, (_, row) in enumerate(dataframe.iterrows(), start=1):
            print(f"{position} / {total}", end="\r")
            filename = row.filename
            label = str(int(row.label))
            save_dir = grouped_images_dir / label
            save_dir.mkdir(exist_ok=True)

            img = image.load_img(train_images_dir / filename)
            img.save(save_dir / filename)
    # f-string so the actual directory is reported instead of the literal placeholder
    print(f"Stored images in {grouped_images_dir}.")

In [None]:
# create the per-class image folders for both splits
for split_frame, split in ((train_df, "train"), (val_df, "val")):
    group_images(split_frame, split)



























































































Done.
Done.


Now we instantiate the data generator with data augmentations such as horizontal and vertical flipping,
zoom, rotation, etc.

In [None]:
# Create the data generators.
# Random augmentations, applied to the training images only.
data_gen_args = dict(
    rotation_range=10.,
    width_shift_range=0.05,
    height_shift_range=0.05,
    zoom_range=0.2,
    channel_shift_range=0.05,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode='constant',
    data_format="channels_last",
)

image_datagen = ImageDataGenerator(**data_gen_args)

# The validation images get their own generator without any random transformation,
# so the validation metrics are measured on the unmodified images and are
# comparable from epoch to epoch.
val_image_datagen = ImageDataGenerator(data_format="channels_last")

train_gen = image_datagen.flow_from_directory(
    "grouped_images_train",
    target_size=(512, 512),
    color_mode="rgb",
    batch_size=16
)

# shuffle=False keeps the validation batches in a fixed order
val_gen = val_image_datagen.flow_from_directory(
    "grouped_images_val",
    target_size=(512, 512),
    color_mode="rgb",
    batch_size=16,
    shuffle=False
)

Found 1734 images belonging to 4 classes.
Found 724 images belonging to 4 classes.


# Building The Model

For the model we will go with something similar to the AlexNet we built in class with some minor modifications. Namely increasing the number of filters and adding additional dense layers with non-linear activations at the top of the model.

In [None]:
class AlexNet(Sequential):
    """AlexNet-style convolutional classifier that is compiled on construction.

    The network stacks five convolution + batch-normalisation stages (with max
    pooling after the first, second and fifth), flattens the feature maps and
    finishes with three 512-unit ReLU dense layers and a softmax output layer.

    Args:
        input_shape: shape of one input image, e.g. ``(512, 512, 3)``.
        num_classes: number of units of the softmax output layer.
        learning_rate: learning rate handed to the Adam optimizer.
            Defaults to 0.01.
    """

    def __init__(self, input_shape, num_classes, learning_rate=0.01):
        super().__init__()

        # Stage 1: large strided kernel, then pooling
        self.add(layers.Conv2D(128, kernel_size=(11, 11), strides=4,
                               padding='valid', activation='relu',
                               input_shape=input_shape,
                               kernel_initializer='he_normal'))
        self.add(layers.BatchNormalization())
        self.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2),
                                     padding='valid', data_format=None))

        # Stage 2
        self.add(layers.Conv2D(256, kernel_size=(5, 5), strides=1,
                               padding='same', activation='relu',
                               kernel_initializer='he_normal'))
        self.add(layers.BatchNormalization())
        self.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2),
                                     padding='valid', data_format=None))

        # Stages 3-5: three identical 3x3 convolutions, pooled once at the end
        for _ in range(3):
            self.add(layers.Conv2D(512, kernel_size=(3, 3), strides=1,
                                   padding='same', activation='relu',
                                   kernel_initializer='he_normal'))
            self.add(layers.BatchNormalization())

        self.add(layers.MaxPooling2D(pool_size=(3, 3), strides=(2, 2),
                                     padding='valid', data_format=None))

        self.add(layers.Flatten())

        # Add non-linear activations to increase model complexity. Every hidden
        # dense layer needs one: a dense layer without activation directly in
        # front of the output layer would merge with it into a single linear map.
        self.add(layers.Dense(512, activation="relu"))
        self.add(layers.Dense(512, activation="relu"))
        self.add(layers.Dense(512, activation="relu"))

        self.add(layers.Dense(num_classes, activation='softmax'))

        # categorical cross-entropy expects one-hot encoded targets
        self.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                     loss='categorical_crossentropy',
                     metrics=['accuracy'])

In [None]:
# build the 4-class network for 512x512 RGB inputs and print its layer summary
alex_net = AlexNet(input_shape=(512, 512, 3), num_classes=4)
alex_net.summary()

Model: "alex_net"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 126, 126, 96)      34944     
_________________________________________________________________
batch_normalization (BatchNo (None, 126, 126, 96)      384       
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 62, 62, 96)        0         
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 62, 62, 256)       614656    
_________________________________________________________________
batch_normalization_1 (Batch (None, 62, 62, 256)       1024      
_________________________________________________________________
max_pooling2d_1 (MaxPooling2 (None, 30, 30, 256)       0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 30, 30, 384)       885

In [None]:
# plot the model so we can visualize what is going on
# tf.keras.utils.plot_model(alex_net, "alex_net.png", show_dtype=True, show_layer_names=True, show_shapes=True)

# Training The Model

In [None]:
# train for 10 epochs on the training generator, validating on the validation generator
alex_net.fit(train_gen, validation_data=val_gen, epochs=10)



KeyboardInterrupt: 

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=0b0632bc-61bc-48c3-a5dd-6b5d85e1e005' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>