# IMPORT ALL REQUIRED LIBRARIES

In [3]:
import os
import shutil
import glob
import json
import pickle
import time
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import cv2
import skimage.io as io
from skimage.transform import resize

from sklearn.preprocessing import MinMaxScaler, Normalizer, StandardScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, cross_validate
from sklearn.utils import shuffle
from sklearn.metrics import (confusion_matrix, classification_report, accuracy_score, f1_score, recall_score, precision_score,
                             auc, roc_curve, roc_auc_score, cohen_kappa_score, plot_confusion_matrix, plot_roc_curve,
                             plot_precision_recall_curve, precision_recall_fscore_support, precision_recall_curve)
from sklearn.utils import compute_class_weight
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC 
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier


import tensorflow as tf
from tensorflow.keras import backend as K
from tensorflow.keras.preprocessing.image import load_img, img_to_array, array_to_img, ImageDataGenerator, save_img
from tensorflow.keras.preprocessing import image_dataset_from_directory
from tensorflow.keras.models import Sequential, Model, load_model, model_from_json
from tensorflow.keras.layers import (Input, InputLayer, Conv2D, MaxPooling2D, Dense, BatchNormalization, Dropout, Activation, GlobalAveragePooling2D, InputSpec, Flatten, Concatenate)
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.applications.vgg19 import VGG19, preprocess_input
from tensorflow.keras.applications.resnet_v2 import ResNet101V2, preprocess_input
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam, SGD, RMSprop, Adadelta, Adagrad
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.metrics import Accuracy, AUC, Precision, Recall
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier

import warnings
warnings.filterwarnings("ignore")

!pip install tensorflow_addons
import tensorflow_addons as tfa

In [4]:
# One seed shared by every random number generator used in this notebook.
SEED = 42
for seed_rng in (np.random.seed, random.seed, tf.random.set_seed):
    seed_rng(SEED)

# Let pandas display up to 200 characters per column before truncating.
pd.set_option('display.max_colwidth', 200)

# DATA ANALYSIS

In [5]:
# Root folder of the dataset; every entry directly below it is joined onto it.
BASE_DIR = "../input/newdsaug/newds/"

# NOTE(review): os.listdir() returns entries in arbitrary order, so the
# positional unpacking below only assigns the intended folders when the
# listing happens to come back as (val, test, train) -- check the printout.
split_dirs = []
for entry in os.listdir(BASE_DIR):
    split_dirs.append(os.path.join(BASE_DIR, entry))
VAL_DIR, TEST_DIR, TRAIN_DIR = split_dirs

# Echo the resolved paths so the assignment can be verified by eye.
for split_dir in (TRAIN_DIR, VAL_DIR, TEST_DIR):
    print(split_dir)

In [6]:
# Class names are the entries found directly under the training folder.
# They are sorted so that the name -> index mapping is deterministic
# (os.listdir() order is arbitrary) and follows the alphanumeric order that
# Keras uses when it infers classes from sub-directory names.
classes = sorted(os.listdir(TRAIN_DIR))

# class name -> integer label, and the inverse lookup.
class2label = {name: index for index, name in enumerate(classes)}
label2class = {index: name for name, index in class2label.items()}

print(class2label)
print("--"*45)
print(label2class)

In [7]:
def _count_images_per_class(split_dir, class_names):
    """Print and return the number of files found in each class folder.

    Parameters:
        split_dir   -- folder that holds one sub-folder per class
        class_names -- iterable of sub-folder names to inspect
    Returns:
        dict mapping each class name to the number of glob matches inside
        ``split_dir/<class name>/``
    """
    counts = {}
    for cat in class_names:
        n_files = len(glob.glob(os.path.join(split_dir, cat, "*")))
        print(f"Number of {cat} Images: {n_files}")
        counts[cat] = n_files
        print('--'*20)
    return counts


print("TRAIN-SET DISTRIBUTION\n")
train_distribution = _count_images_per_class(TRAIN_DIR, classes)

# The validation counts get their own dictionary; previously they were written
# into test_distribution and then thrown away when the test split was counted.
print("\nVAL-SET DISTRIBUTION\n")
val_distribution = _count_images_per_class(VAL_DIR, classes)

print("\nTEST-SET DISTRIBUTION\n")
test_distribution = _count_images_per_class(TEST_DIR, classes)

In [8]:
# Parallel lists holding one entry per training image; they are zipped into a
# DataFrame further down.
categories = []
image_name = []
image_id = []
image_format = []
labels = []
img_path = []
img_height = []
img_width = []

for cat in classes:
    # Sorted so that the row order does not depend on the file system.
    for file_path in sorted(glob.glob(os.path.join(TRAIN_DIR, cat, "*"))):
        image = cv2.imread(file_path)
        # cv2.imread() returns None instead of raising when a file cannot be
        # decoded, so fail here with the offending path.
        if image is None:
            raise ValueError(f"Could not read image file: {file_path}")
        h, w = image.shape[:2]

        img = os.path.basename(file_path)
        # splitext() cuts at the last dot only, so file names that contain
        # additional dots (or no dot at all) are handled.
        img_id, extension = os.path.splitext(img)

        img_height.append(h)
        img_width.append(w)
        # The parent folder of every globbed file is the class folder itself.
        categories.append(cat)
        image_name.append(img)
        image_id.append(img_id)
        image_format.append(extension.lstrip('.'))
        labels.append(class2label[cat])
        img_path.append(file_path)

In [9]:
# Sanity check: every metadata list must hold exactly one entry per image.
_metadata_lists = (categories, image_name, image_id, image_format, labels, img_path, img_height, img_width)
assert len({len(column) for column in _metadata_lists}) == 1, "metadata lists differ in length"

In [10]:
# Assemble the per-image lists into one table, one row per training image.
column_names = ['image_id' , 'image_name' , 'format' , 'Class' , 'label' , 'image_path' , 'height' , 'width']
rows = list(zip(image_id, image_name, image_format, categories, labels, img_path, img_height, img_width))

data = pd.DataFrame(rows)
data.columns = column_names

# Preview ten rows; the fixed seed makes the preview repeatable.
data.sample(10, random_state=SEED)

In [11]:
# (number of rows, number of columns) of the image metadata table.
data.shape

In [12]:
# Number of training images per class, most frequent class first.
data['Class'].value_counts()

In [13]:
# Total number of missing cells across the whole table.
null_count = data.isnull().sum().sum()
if null_count != 0:
    print(f'{null_count} null values found in dataset')
else:
    print('No Null Values found in dataset')

# Number of rows that are exact duplicates of an earlier row.
duplicate_count = data.duplicated().sum()
if duplicate_count != 0:
    print(f'{duplicate_count} Duplicate values found in dataset')
else:
    print('No Duplicate Values found in dataset')

# plot figures - image information

In [14]:
plt.figure(figsize=(8, 8))

# Classes ordered from most to least frequent; computed once and reused for
# the bar order and the tick labels.
class_order = data['Class'].value_counts().index
total_samples = data.shape[0]

sns.countplot(data = data , x = 'Class' , order = class_order)
sns.despine(top=True, right=True, left=True, bottom=False)

# One tick per class that is actually present instead of a fixed [0, 1, 2, 3].
plt.xticks(ticks=list(range(len(class_order))), labels=class_order.to_list(), fontsize=12, rotation=340)
plt.yticks([])
plt.title('Number of Samples per Class' , fontsize=15 , pad=20 , fontstyle='italic' , fontweight = 800)
plt.xlabel('Type of Diagnosis' , fontsize=14 , fontweight=400, labelpad=16)
plt.ylabel('')
ax = plt.gca()

# Write each bar's share of the whole dataset just above the bar.
for p in ax.patches:
    ax.annotate("%.1f%%" % (100*float(p.get_height()/total_samples)), (p.get_x() + p.get_width() / 2., abs(p.get_height())), ha='center',
                va='bottom', color='black', xytext=(0,2), rotation='horizontal', textcoords='offset points', fontsize = 12, fontstyle = 'italic')

#plt.savefig("samples.png",dpi=720)
plt.tight_layout()

In [15]:
# Twelve images drawn with a fixed seed, shown in a 4 x 3 grid.
sample_data = data.sample(12 , random_state = SEED)
sample_imgs = sample_data['image_path'].to_list()

plt.figure(figsize = (12,12))
for position, image_file in enumerate(sample_imgs, start=1):
    plt.subplot(4 , 3 , position)
    # OpenCV decodes colour images in BGR channel order while matplotlib
    # interprets arrays as RGB, so convert before displaying.
    img = cv2.cvtColor(cv2.imread(image_file), cv2.COLOR_BGR2RGB)
    # The folder that contains the file is used as the title.
    plt.title(os.path.basename(os.path.dirname(image_file)), fontsize = 12)
    plt.axis('off')
    plt.imshow(img)

plt.savefig("brainPlots.png",dpi=720)

In [16]:
# Summarise the array held in `img`.
separator = "=="*20
height, width = img.shape[0], img.shape[1]

print(f'Shape of the image : {img.shape}')
print(f'Image Height: {height}')
print(f'Image Width: {width}')
# img.ndim is the number of array dimensions.
print(f'Image Dimensions/Channels: {img.ndim}')
print(separator)
print(f'Image size: {img.size}')
print(f'Image Data Type: {img.dtype}')
print(separator)
print(f'Maximum RGB value in this image {img.max()}')
print(f'Minimum RGB value in this image {img.min()}')

In [17]:
# Per-image intensity statistics, collected in row order.
mean_val = []
std_val = []
max_val = []
min_val = []

# Iterate over the column values directly: this does not rely on the frame
# having a default 0..n-1 index the way data['image_path'][i] does.
for image_file in data['image_path']:
    img_arr = img_to_array(load_img(image_file))
    mean_val.append(img_arr.mean())
    std_val.append(img_arr.std())
    max_val.append(img_arr.max())
    min_val.append(img_arr.min())

# Lists are assigned positionally, one value per row.
data['mean_pixel'] = mean_val
data['std_dev'] = std_val
data['max_pixel'] = max_val
data['min_pixel'] = min_val

In [18]:
# Difference between the dataset-wide average of 'mean_pixel' and each image's own value.
overall_mean_pixel = data['mean_pixel'].mean()
data['mean_variation'] = overall_mean_pixel - data['mean_pixel']

# Preview ten random rows including the new columns.
data.sample(10)

In [19]:
# Violin plot of the per-image mean pixel value, one violin per class.
plt.figure(figsize=(8, 5))
sns.violinplot(data=data, x='Class', y='mean_pixel')
sns.despine()
plt.xticks(rotation=340)
plt.title('Mean value distribution for all Classes')
#plt.savefig("meanValue.png",dpi=720)
plt.show()

In [20]:
# Density of the per-image mean pixel value, one filled curve per class.
plt.figure(figsize = (12, 8))
# `fill` is the current name of the option; `shade` is its deprecated alias.
sns.kdeplot(data = data , x = 'mean_pixel' , hue = 'Class' , fill = True , linewidth = 2.5)
plt.title('Image color mean value distribution')
#plt.savefig("colorMean.png",dpi=720)
plt.show()

In [21]:
# Density of the per-image maximum pixel value, one curve per class.
plt.figure(figsize=(8, 5))
sns.kdeplot(
    data=data,
    x='max_pixel',
    hue='Class',
    linewidth=2.5,
)
plt.title('Image color max value distribution by class')
#plt.savefig("ImagecolorMaxValueDistribution.png",dpi=720)
plt.show()

In [22]:
# Scatter of mean vs. standard deviation per image, one facet column per class.
sns.set(rc={'figure.figsize': (16, 8)})
fig = sns.relplot(
    data=data,
    x="mean_pixel",
    y="std_dev",
    col='Class',
    kind='scatter',
    alpha=0.8,
)
sns.despine(top=True, right=True, left=False, bottom=False)

plt.suptitle('Mean and Standard Deviation of Image Samples', fontsize=16, fontweight=550)
fig.tight_layout(pad=3);
#plt.savefig("MeanAndStdOfImageSamples.png",dpi=720)


In [23]:
# Same mean / standard-deviation scatter on a single axis, coloured by class.
plt.figure(figsize=(20, 8))
sns.set(style="ticks", font_scale=1)
ax = sns.scatterplot(
    data=data,
    x="mean_pixel",
    y="std_dev",
    hue='Class',
    alpha=0.8,
)
sns.despine(top=True, right=True, left=False, bottom=False)
plt.xticks(rotation=0, fontsize=12)

axis_font = {'fontsize': 14, 'weight': 'bold'}
ax.set_xlabel('Image Channel Colour Mean', **axis_font)
ax.set_ylabel('Image Channel Colour Standard Deviation', **axis_font)
plt.title('Mean and Standard Deviation of Image Samples', fontsize=16, weight='bold');
#plt.savefig("MeanAndStdOfImages2.png",dpi=720)


# Hyperparameters and Model Training

In [24]:
# Training hyper-parameters used by the data generators and the training loop.
BATCH_SIZE, NUM_EPOCHS = 64, 75

# Spatial size of the images, and the same size with three colour channels appended.
IMG_SIZE = (224, 224)
INPUT_SHAPE = (*IMG_SIZE, 3)

In [25]:
# Every generator rescales pixel values by the same factor.
pixel_rescale = 1./255

# Random augmentation is applied to the training images only.
train_augmentation = dict(
    horizontal_flip=True,
    brightness_range=[0.6, 1.4],
    rotation_range=15,
)
train_datagen = ImageDataGenerator(rescale=pixel_rescale, **train_augmentation)

# Validation and test images are only rescaled.
val_datagen = ImageDataGenerator(rescale=pixel_rescale)
test_datagen = ImageDataGenerator(rescale=pixel_rescale)

In [26]:
# Options that are identical for the three directory iterators.
common_flow_options = dict(
    target_size=IMG_SIZE,
    color_mode="rgb",
    class_mode="categorical",
    batch_size=BATCH_SIZE,
    interpolation="bicubic",
)

# Training batches are explicitly shuffled.
train_dataset = train_datagen.flow_from_directory(directory=TRAIN_DIR, shuffle=True, **common_flow_options)

# The validation iterator keeps the library's default for `shuffle`.
val_dataset = val_datagen.flow_from_directory(directory=VAL_DIR, **common_flow_options)

# Test batches keep a fixed order so predictions line up with test_dataset.classes.
test_dataset = test_datagen.flow_from_directory(directory=TEST_DIR, shuffle=False, **common_flow_options)

In [27]:
# len() of each iterator is reported as its number of batches.
print(f'Number of batch iterations on training data {len(train_dataset)}')
print(f'Number of batch iterations on validation data {len(val_dataset)}')
print(f'Number of batch iterations on test data {len(test_dataset)}')

In [28]:
# NOTE(review): the two names are inverted relative to their content here.
# `label2class` receives class_indices (class name -> index) and `class2label`
# receives the reverse (index -> class name).  The classification-report cell
# further down reads class2label keys as indices and values as names, so the
# two must be renamed together if this is ever cleaned up.
label2class = train_dataset.class_indices
class2label = {index: name for name, index in label2class.items()}

print(label2class)
print(class2label)

In [29]:
# Folder that receives the model checkpoints.  makedirs(exist_ok=True) lets the
# cell be re-run without failing when the folder is already there.
checkpoint_filepath = "./temp/"
os.makedirs(checkpoint_filepath, exist_ok=True)

In [38]:
# SGD with Nesterov momentum and a small learning-rate decay.
optimizer = SGD(
    learning_rate=0.001,
    momentum=0.9,
    nesterov=True,
    decay=1e-6,
)

# Metrics tracked besides the loss, in this order.
metrics = ["accuracy", Recall(), Precision(), AUC()]

#early_stop = EarlyStopping(monitor='val_loss', patience=8, mode='min', verbose=1)

# Multiply the learning rate by 0.1 after 5 epochs without val_loss improvement.
lr_reduction = ReduceLROnPlateau(
    monitor="val_loss",
    factor=0.1,
    patience=5,
    verbose=1,
    mode="auto",
)

# Save to checkpoint_filepath only when val_loss reaches a new minimum.
checkpoint = ModelCheckpoint(
    filepath=checkpoint_filepath,
    monitor="val_loss",
    verbose=1,
    save_best_only=True,
    mode="min",
)

callbacks_list = [lr_reduction, checkpoint]

In [39]:
def categorical_focal_loss(gamma=2.0, alpha=0.25):
    """
    Build a focal loss function for multi-class classification.
    Formula (per class, summed over the class axis):
        loss = -alpha * y_true * ((1-p)^gamma) * log(p)
    Parameters:
        alpha -- weighting factor, as in balanced cross entropy
        gamma -- focusing parameter of the modulating factor (1-p)
    Default value:
        gamma -- 2.0 as mentioned in the paper
        alpha -- 0.25 as mentioned in the paper
    Returns:
        focal_loss(y_true, y_pred) -- a function that returns one loss value
        per sample (the class axis is summed out)
    """
    def focal_loss(y_true, y_pred):
        # Keep predictions strictly inside (0, 1) so log() stays finite and
        # backpropagation does not produce NaN.
        epsilon = K.epsilon()
        y_pred = K.clip(y_pred, epsilon, 1.0 - epsilon)
        # Cross entropy term; y_true selects (or weights) the target classes.
        cross_entropy = -y_true * K.log(y_pred)
        # Weighting and modulating factors.  y_true is applied only once, in
        # the cross entropy: multiplying by it here as well would square the
        # targets, which changes the result for non one-hot (soft) labels.
        modulation = alpha * K.pow(1.0 - y_pred, gamma)
        # Sum over the class axis; the last axis is used so inputs with extra
        # leading dimensions work as well.
        return K.sum(modulation * cross_entropy, axis=-1)

    return focal_loss

# VGG16

In [41]:
# The output layer gets one unit per class found by the training iterator
# rather than a hard-coded 4.
num_classes = len(train_dataset.class_indices)

# ImageNet-pretrained VGG16 without its classifier head; the input shape comes
# from the shared INPUT_SHAPE constant so it stays in sync with the generators.
base_model = VGG16(input_shape=INPUT_SHAPE, include_top=False, weights="imagenet")
x = base_model.output
x = Dropout(0.25)(x)
x = Flatten()(x)

# New classifier head: three Dense -> ReLU -> Dropout stages.
for units in (4096, 4096, 1000):
    x = Dense(units, kernel_initializer='he_uniform')(x)
    x = Activation('relu')(x)
    x = Dropout(0.25)(x)

output = Dense(num_classes, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=output, name="VGG16_Network")

In [42]:
# Print the model's layer summary.
model.summary()

In [34]:
# Render the model graph to finalModel.png (layer shapes omitted) at 720 dpi.
plot_model(model, 'finalModel.png', show_shapes=False,dpi=720)


In [43]:
# Compile with the optimizer and metric list defined earlier.  The loss is the
# built-in categorical cross entropy; categorical_focal_loss is not used here.
model.compile(optimizer = optimizer, loss = "categorical_crossentropy", metrics = metrics)

In [44]:
# Train for NUM_EPOCHS, validating after every epoch.  callbacks_list is
# already a list of callbacks, so it is passed as-is instead of being wrapped
# in another list.
hist = model.fit(x=train_dataset, epochs=NUM_EPOCHS, callbacks=callbacks_list, steps_per_epoch=len(train_dataset),
                 validation_data=val_dataset, validation_steps=len(val_dataset))

In [47]:
# Names of everything recorded during training, in the order Keras logged them.
metricsEval = [*hist.history.keys()]
print(metricsEval, '\n')

# One row per epoch, one column per recorded quantity; also saved to disk.
historyDF = pd.DataFrame(data=hist.history)
historyDF.head()
historyDF.to_csv('mycsvfile.csv', index=False)

In [49]:
fig, axarr = plt.subplots(1, 3, figsize=(21, 8), sharex=True)

sns.set(style="ticks", font_scale=1)
sns.despine(top=True, right=True, left=False, bottom=False)

epochs = historyDF.index

# NOTE(review): curves are looked up by position in metricsEval, so this relies
# on the history keys keeping the order loss, accuracy, ..., val_loss,
# val_accuracy, ..., with the learning rate at position 10 -- confirm against
# the printed key list.
paired_panels = [
    (0, 5, 'Loss', 'Training and Validation Loss'),
    (1, 6, 'Accuracy', 'Training and Validation Accuracy'),
]
for panel, (train_pos, val_pos, y_label, panel_title) in zip(axarr, paired_panels):
    sns.lineplot(x=epochs, y=hist.history[metricsEval[train_pos]], ax=panel, label="Training");
    sns.lineplot(x=epochs, y=hist.history[metricsEval[val_pos]], ax=panel, label="Validation");
    panel.set_ylabel(y_label)
    panel.set_title(panel_title, fontsize=17)

# Third panel: a single, unlabeled curve.
lr_panel = axarr[2]
sns.lineplot(x=epochs, y=hist.history[metricsEval[10]], ax=lr_panel);
lr_panel.set_ylabel('Learning Rate')
lr_panel.set_title('LR during training', fontsize=17)

for panel in axarr:
    panel.set_xlabel('Epochs')

plt.suptitle('Training and Validation Performance plots', fontsize=19, weight='bold');
fig.tight_layout(pad=3.0)
plt.savefig("TR-val_auto.png", dpi=1080)
plt.show()


In [51]:
fig, axarr = plt.subplots(1, 3, figsize=(21, 8), sharex=True)

sns.set(style="ticks", font_scale=1)
sns.despine(top=True, right=True, left=False, bottom=False)

epochs = historyDF.index

# NOTE(review): as in the previous figure, curves are selected by their
# position in metricsEval (training at 2-4, validation at 7-9) -- confirm the
# positions against the printed key list.
panels = [
    (3, 8, 'Precision'),
    (2, 7, 'Recall'),
    (4, 9, 'AUC'),
]
for panel, (train_pos, val_pos, metric_name) in zip(axarr, panels):
    sns.lineplot(x=epochs, y=hist.history[metricsEval[train_pos]], ax=panel, label="Training");
    sns.lineplot(x=epochs, y=hist.history[metricsEval[val_pos]], ax=panel, label="Validation");
    panel.set_ylabel(metric_name)
    panel.set_title(f'Training and Validation {metric_name}', fontsize=17)
    panel.set_xlabel('Epochs')

plt.suptitle('Training and Validation Performance plots', fontsize=19, weight='bold');
fig.tight_layout(pad=3.0)
plt.savefig("metrics_auto.png", dpi=1080)
plt.show()

In [52]:
# evaluate() returns the loss followed by the compiled metrics in their listed
# order (accuracy, recall, precision, AUC).  The AUC goes into `auc_value` so
# the `auc` function imported from sklearn.metrics is not overwritten.
loss, acc, recall, precision, auc_value = model.evaluate(train_dataset)
print("\n---------------- Evaluation on Train DataSet ----------------\n")
print(f'Loss on Train set: {loss:.4f}')
print(f'Accuracy on Train set: {acc*100:.2f}%')
print(f'Recall on Train set: {recall:.2f}')
print(f'Precision on Train set: {precision:.2f}')
print(f'AUC on Train set: {auc_value:.2f}')

In [53]:
# evaluate() returns the loss followed by the compiled metrics in their listed
# order (accuracy, recall, precision, AUC).  The AUC goes into `auc_value` so
# the `auc` function imported from sklearn.metrics is not overwritten.
loss, acc, recall, precision, auc_value = model.evaluate(val_dataset)
print("\n---------------- Evaluation on Validation DataSet ----------------\n")
print(f'Loss on Validation set: {loss:.2f}')
print(f'Accuracy on Validation set: {acc*100:.2f}%')
print(f'Recall on Validation set: {recall:.2f}')
print(f'Precision on Validation set: {precision:.2f}')
print(f'AUC on Validation set: {auc_value:.2f}')

In [54]:
# evaluate() returns the loss followed by the compiled metrics in their listed
# order (accuracy, recall, precision, AUC).  The AUC goes into `auc_value` so
# the `auc` function imported from sklearn.metrics is not overwritten.
loss, acc, recall, precision, auc_value = model.evaluate(test_dataset)
print("\n---------------- Evaluation on Test DataSet ----------------\n")
print(f'Loss on Test set: {loss:.2f}')
print(f'Accuracy on Test set: {acc*100:.2f}%')
print(f'Recall on Test set: {recall:.2f}')
print(f'Precision on Test set: {precision:.2f}')
print(f'AUC on Test set: {auc_value:.2f}')

In [55]:
# Class probabilities for every test image; the test iterator is not shuffled,
# so rows line up with test_dataset.classes.
y_pred = model.predict(test_dataset)
# Index of the highest-scoring class per row, computed in one vectorised call.
predictions = np.argmax(y_pred, axis=1)
y_true = test_dataset.classes

# At this point class2label maps class index -> class name (it was rebuilt
# from the training iterator's class_indices).
cr_indexes = list(class2label.keys())
classes = list(class2label.values())
print(classification_report(y_true, predictions, labels = cr_indexes, target_names = classes))

In [60]:
# Passing the label ids explicitly guarantees one row/column per class, so the
# matrix always matches the `classes` axis labels even if some class never
# occurs in y_true or predictions.
CMatrix = pd.DataFrame(confusion_matrix(y_true, predictions, labels = cr_indexes), columns = classes, index = classes)

plt.figure(figsize=(10, 6))
# The colour scale is fixed to the range 0..250.
ax = sns.heatmap(CMatrix, annot = True, fmt = 'g' ,vmin = 0, vmax = 250,cmap = 'YlGnBu')
ax.set_xlabel('Predicted',fontsize = 14,weight = 'bold')
ax.set_xticklabels(ax.get_xticklabels(),rotation =0);
ax.set_ylabel('Actual',fontsize = 14,weight = 'bold')
ax.set_yticklabels(ax.get_yticklabels(),rotation =0);
ax.set_title('Confusion Matrix - Test Set', fontsize = 19, weight = 'bold', pad=20);
plt.savefig("confusion_matrix.png",dpi=1080)

In [61]:
acc = accuracy_score(y_true, predictions)

# Explicit label ids keep exactly one result row per class, so the rows always
# match `classes` when the index is assigned below.
results_all = precision_recall_fscore_support(y_true, predictions, labels=cr_indexes, average='macro', zero_division=1)
results_class = precision_recall_fscore_support(y_true, predictions, labels=cr_indexes, average=None, zero_division=1)

# Per-class rows first, then one macro-averaged row labelled 'Total'.
# 'S' is the fourth value returned by sklearn (the support).
metric_columns = ['Precision','Recall', 'F1-Score','S']
all_df = pd.concat([pd.DataFrame(list(results_class)).T,pd.DataFrame(list(results_all)).T])
all_df.columns = metric_columns
all_df.index = classes+['Total']

def metrics_plot(df,metric):
    """Draw a bar chart of one metric column, one bar per row of ``df``.

    Parameters:
        df     -- DataFrame whose index holds the bar labels
        metric -- name of the column to plot; values are annotated as
                  percentages (value * 100)
    """
    plt.figure(figsize=(12,6))
    ax = sns.barplot(data=df, x=df.index, y=metric, palette = "Blues_d")
    #Bar Labels
    for p in ax.patches:
        ax.annotate("%.1f%%" % (100*p.get_height()), (p.get_x() + p.get_width() / 2., abs(p.get_height())),
        ha='center', va='bottom', color='black', xytext=(-3, 5),rotation = 'horizontal',textcoords='offset points')
    sns.despine(top=True, right=True, left=True, bottom=False)
    ax.set_xlabel('Class',fontsize = 14,weight = 'bold')
    ax.set_ylabel(metric,fontsize = 14,weight = 'bold')
    # The y axis is hidden; the bar annotations carry the values.
    ax.set(yticklabels=[])
    ax.axes.get_yaxis().set_visible(False)
    plt.title(metric+ ' Results per Class', fontsize = 16,weight = 'bold');

metrics_plot(all_df, 'Precision')       # Results by Class
metrics_plot(all_df, 'Recall')          # Results by Class
metrics_plot(all_df, 'F1-Score')         # Results by Class
print('----------------- Overall Results -----------------')
print('Accuracy Result: %.2f%%'%(acc*100))                   # Accuracy of the whole Dataset
# The macro-averaged row is addressed by its label rather than by a fixed row
# number, so this keeps working for any number of classes.
print('Precision Result: %.2f%%'%(all_df.loc['Total', 'Precision']*100))     # Precision of the whole Dataset
print('Recall Result: %.2f%%'%(all_df.loc['Total', 'Recall']*100))        # Recall of the whole Dataset