<a href="https://colab.research.google.com/github/emmex2000/Applied-Statistics-Course-Work/blob/main/VGGNet.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Attach Google Drive to the Colab VM; `force_remount=True` re-mounts even if
# a previous session left the mount point in place.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT, force_remount=True)

Mounted at /content/drive


In [None]:
import shutil
import urllib
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
% matplotlib inline

In [None]:
# Project folder on the mounted Drive. The path is absolute (anchored at the
# '/content/drive' mount point) so this cell can be re-run safely: a relative
# path would stop resolving once the working directory has already changed.
DIR = "/content/drive/MyDrive/Thorax Disease Classification"
os.chdir(DIR)

In [None]:
# Load the label table (image file name plus one 0/1 column per finding)
# and preview the first rows.
LABELS_CSV = "df.csv"
df = pd.read_csv(LABELS_CSV)
df.head()

Unnamed: 0,Image Index,Effusion,No Finding,Pneumonia,Pneumothorax
0,00006022_006.png,0,1,0,0
1,00020953_001.png,1,0,0,0
2,00029052_010.png,1,0,0,1
3,00014125_054.png,1,0,0,0
4,00016987_010.png,1,0,0,0


In [None]:
# Positional 70/30 split: the first 70% of the rows become the training
# frame, the remaining rows the held-out test frame (no shuffling).
TRAIN_FRACTION = .7
n_train = int(len(df) * TRAIN_FRACTION)
train, test = df.iloc[:n_train], df.iloc[n_train:]

In [None]:
# Sanity check: (rows, columns) of the training frame.
train.shape

(17680, 5)

In [None]:
# Every column after the first one (the image file name) is a label column.
col = list(train.columns[1:])
col

['Effusion', 'No Finding', 'Pneumonia', 'Pneumothorax']

In [None]:
import tensorflow as tf
from keras import backend as K 
from keras_preprocessing.image import ImageDataGenerator
import keras

In [None]:
# Generator shared by the training and validation flows: multiplies pixel
# values by 1/255 and reserves 20% of the supplied frame for validation.
datagen = ImageDataGenerator(
    rescale=1.0 / 255.0,
    validation_split=0.2,
)



In [None]:
# Batches for the "training" subset of `train`. `class_mode="raw"` yields the
# label columns listed in `col` unchanged, as a numeric array per batch.
train_flow_options = dict(
    dataframe=train,
    directory="data/images/",
    x_col="Image Index",
    y_col=col,
    subset="training",
    batch_size=128,
    seed=42,
    shuffle=True,
    class_mode="raw",
    target_size=(224, 224),
)
train_generator = datagen.flow_from_dataframe(**train_flow_options)



Found 9355 validated image filenames.


  .format(n_invalid, x_col)


In [None]:
# Batches for the "validation" subset of `train` (the share held back by
# `validation_split` on `datagen`); same image size and label layout as the
# training flow.
valid_flow_options = dict(
    dataframe=train,
    directory="data/images/",
    x_col="Image Index",
    y_col=col,
    subset="validation",
    batch_size=128,
    seed=42,
    shuffle=True,
    class_mode="raw",
    target_size=(224, 224),
)
valid_generator = datagen.flow_from_dataframe(**valid_flow_options)



Found 2338 validated image filenames.


  .format(n_invalid, x_col)


In [None]:
# Test flow: only rescaling, no validation split. `shuffle=False` keeps the
# batch order fixed so predictions line up with `test_generator.labels`.
test_datagen = ImageDataGenerator(rescale=1.0 / 255.0)
test_flow_options = dict(
    dataframe=test,
    directory="data/images/",
    x_col="Image Index",
    y_col=col,
    batch_size=32,
    seed=42,
    shuffle=False,
    class_mode="raw",
    target_size=(224, 224),
)
test_generator = test_datagen.flow_from_dataframe(**test_flow_options)

Found 5016 validated image filenames.


  .format(n_invalid, x_col)


In [None]:
# Spatial size fed to the network, and the full input shape with the
# 3-channel axis appended.
IMG_SIZE = (224, 224)
IMG_SHAPE = (*IMG_SIZE, 3)
IMG_SHAPE

(224, 224, 3)

In [None]:
# VGG16 convolutional stack with ImageNet weights and without the original
# classifier head (`include_top=False`).
base_model = tf.keras.applications.VGG16(
    weights='imagenet',
    include_top=False,
    input_shape=IMG_SHAPE,
)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
# Pull one batch from the training flow and push it through the base model
# to inspect the shape of the extracted feature maps.
sample_batches = iter(train_generator)
image_batch, label_batch = next(sample_batches)
feature_batch = base_model(image_batch)
print(feature_batch.shape)

(128, 7, 7, 512)


In [None]:
# Let's take a look at the base model architecture
# (prints each layer with its output shape and parameter count).
base_model.summary()

Model: "vgg16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 
 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0     

In [None]:
# Collapse each feature map to a single value by averaging over its spatial
# positions, and check the resulting shape on the sample batch.
global_average_layer = tf.keras.layers.GlobalAveragePooling2D()
feature_batch_average = global_average_layer(feature_batch)
pooled_shape = feature_batch_average.shape
print(pooled_shape)

(128, 512)


In [None]:
# Classification head: one linear unit per label column. No activation is
# applied, so the layer emits raw scores (logits).
prediction_layer = tf.keras.layers.Dense(units=4)
prediction_batch = prediction_layer(feature_batch_average)
prediction_shape = prediction_batch.shape
print(prediction_shape)

(128, 4)


In [None]:
# Mark the whole VGG16 base as trainable; a later cell re-freezes the layers
# that come before `fine_tune_at`.
base_model.trainable = True

In [None]:
# Assemble the classifier: VGG16 features -> global average pooling ->
# dropout (rate 0.2) -> linear 4-unit head.
# The base model is called with `training=False`, so it always runs in
# inference mode inside this graph.
inputs = tf.keras.Input(shape=IMG_SHAPE)
features = base_model(inputs, training=False)
pooled = global_average_layer(features)
regularized = tf.keras.layers.Dropout(0.2)(pooled)
outputs = prediction_layer(regularized)
model = tf.keras.Model(inputs, outputs)

In [None]:
# Let's take a look to see how many layers are in the base model
print("Number of layers in the base model: ", len(base_model.layers))

# Fine-tune from this layer onwards.
# VGG16 without its top has 19 layers, so the index must lie inside that
# range; a larger value (e.g. 100) makes the slice below cover every layer
# and silently freezes the whole base model. Index 15 is `block5_conv1`,
# i.e. only the last convolutional block is fine-tuned.
fine_tune_at = 15
assert 0 < fine_tune_at < len(base_model.layers), "fine_tune_at must index a base-model layer"

# Freeze all the layers before the `fine_tune_at` layer
for layer in base_model.layers[:fine_tune_at]:
  layer.trainable =  False

Number of layers in the base model:  19


In [None]:
# custom loss

POS_WEIGHT = 10  # multiplier for positive targets, needs to be tuned


def wce(target, output, from_logits=False):
    """
    Weighted binary cross-entropy between an output tensor and a target tensor.

    Every label column is treated as an independent binary problem and the
    positive targets are up-weighted by the module-level ``POS_WEIGHT``.

    Args:
        target: ground-truth 0/1 labels; reshaped to the shape of ``output``
            and cast to float32.
        output: model predictions. By default these are interpreted as
            probabilities in (0, 1) and converted back to logits.
        from_logits: set to True when ``output`` already holds raw logits
            (e.g. a final Dense layer without a sigmoid activation); the
            probability-to-logit conversion is then skipped.

    Returns:
        Scalar tensor: the mean weighted cross-entropy over all elements.
    """
    if not from_logits:
        # Transform probabilities back to logits. Clipping keeps the ratio
        # inside log() strictly positive and finite.
        output = K.clip(output, K.epsilon(), 1 - K.epsilon())
        output = tf.math.log(output / (1 - output))

    # Align the labels with the predictions instead of hard-coding the
    # number of classes.
    labels = tf.cast(tf.reshape(target, tf.shape(output)), dtype=tf.float32)

    # compute weighted loss
    loss = tf.nn.weighted_cross_entropy_with_logits(labels=labels,
                                                    logits=output,
                                                    pos_weight=POS_WEIGHT)
    return K.mean(loss)

In [None]:
# `clone_model` rebuilds the architecture with freshly initialised weights,
# so the weights of `model` (including the ImageNet-pretrained VGG16 base)
# have to be copied over explicitly.
model1 = keras.models.clone_model(model)
model1.set_weights(model.get_weights())

In [None]:
#COMPILATION OF MODEL ARCHITECTURE

base_learning_rate = 0.0001
# The targets are multi-hot (one image can carry several findings) and the
# Dense head emits raw logits, so every label is scored as an independent
# binary problem rather than through a softmax over the four columns.
# NOTE(review): 'accuracy' is computed on the raw logits here; treat it as a
# rough indicator and rely on the ROC-AUC cells for evaluation.
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              # `learning_rate` replaces the deprecated `lr` alias.
              optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate/10),
              metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [None]:
# Print the assembled model layer by layer, with trainable and non-trainable parameter totals.
model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 vgg16 (Functional)          (None, 7, 7, 512)         14714688  
                                                                 
 global_average_pooling2d (G  (None, 512)              0         
 lobalAveragePooling2D)                                          
                                                                 
 dropout (Dropout)           (None, 512)               0         
                                                                 
 dense (Dense)               (None, 4)                 2052      
                                                                 
Total params: 14,716,740
Trainable params: 2,052
Non-trainable params: 14,714,688
_____________________________________________

In [None]:
# Number of weight tensors the optimizer will update.
len(model.trainable_variables)

2

In [None]:
# Report how many GPUs TensorFlow can see in this runtime.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  1


In [None]:
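# #AFTER COMPILING THE MODEL, THE MODEL IS TRAINED BELOW
# # (training is currently disabled; the already-trained model is loaded from "models/VGG-model.h5" in a later cell)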
# callbacks = [
#     # tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True,),
#     tf.keras.callbacks.TensorBoard(log_dir='./logs/VGGlogs'),
#     # tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=3, verbose=0, min_delta=0.0001, min_lr=0, ),
    
# ]

# STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
# STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size
# STEP_SIZE_TEST = test_generator.n//test_generator.batch_size

# history = model.fit_generator(generator=train_generator, steps_per_epoch=STEP_SIZE_TRAIN,
#                     validation_data=valid_generator,  validation_steps=STEP_SIZE_VALID,
#                     epochs=100, verbose=1, callbacks=callbacks)

In [None]:
# #os.mkdir("models")
# model.save("models/VGG-model.h5")

In [None]:
# Ground-truth label matrix of the test flow. The generator was created with
# `shuffle=False`, so row order matches the order of the predictions.
test_label = test_generator.labels

In [None]:
# Restore the previously trained baseline model for inference only
# (`compile=False` skips restoring the loss and optimizer).
VGG_MODEL_PATH = "models/VGG-model.h5"
model = keras.models.load_model(VGG_MODEL_PATH, compile=False)

In [None]:
# `Model.predict` accepts generators directly; `predict_generator` is
# deprecated and only forwards to it.
# The displayed values are not confined to [0, 1]: they are raw scores
# (logits), which is sufficient for the rank-based ROC-AUC computed below.
prob = model.predict(test_generator)
prob

  """Entry point for launching an IPython kernel.


array([[ 1.5050982 ,  0.79469734, -0.7464331 ,  0.6723772 ],
       [ 1.633309  ,  0.6530788 , -1.0077466 ,  0.43950802],
       [ 1.5164591 ,  0.6179038 , -0.86806506,  0.29752272],
       ...,
       [ 1.3325318 ,  0.9076812 , -0.83797073,  0.61440784],
       [ 1.4823662 ,  0.58587015, -0.8549471 ,  0.3671271 ],
       [ 1.5349084 ,  0.35958666, -0.76027036,  0.35115814]],
      dtype=float32)

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# Macro-averaged ROC-AUC of the baseline model across all label columns.
roc_auc_score(y_true=test_label, y_score=prob, average="macro")

0.5862350278902239

In [None]:
# Baseline model, label column 0 (`Effusion`).
roc_auc_score(y_true=test_label[:, 0], y_score=prob[:, 0])

0.6570818071304729

In [None]:
# Baseline model, label column 1 (`No Finding`).
roc_auc_score(y_true=test_label[:, 1], y_score=prob[:, 1])

0.6603799767152039

In [None]:
# Baseline model, label column 2 (`Pneumonia`).
roc_auc_score(y_true=test_label[:, 2], y_score=prob[:, 2])

0.462857056402808

In [None]:
# Baseline model, label column 3 (`Pneumothorax`).
roc_auc_score(y_true=test_label[:, 3], y_score=prob[:, 3])

0.5646212713124106

In [None]:
base_learning_rate = 0.0001


def wce_from_logits(target, logits):
    """Apply `wce` to a model whose final layer emits raw logits.

    `wce` clips its predictions to (epsilon, 1 - epsilon) and converts them
    back to logits, i.e. it expects probabilities. `model1` ends in a Dense
    layer without activation, so its outputs are squashed with a sigmoid
    first; otherwise the clip would discard almost all of the signal.
    """
    return wce(target, tf.sigmoid(logits))


model1.compile(loss=wce_from_logits,
               # `learning_rate` replaces the deprecated `lr` alias.
               optimizer = tf.keras.optimizers.Adam(learning_rate=base_learning_rate/10),
               metrics=['accuracy'])

  super(Adam, self).__init__(name, **kwargs)


In [None]:
 #AFTER COMPILING THE MODEL, THE MODEL IS TRAINED BELOW
callbacks = [
    # tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True,),
    tf.keras.callbacks.TensorBoard(log_dir='./logs/VGGlogs1' , histogram_freq=1),
    # tf.keras.callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.1, patience=3, verbose=0, min_delta=0.0001, min_lr=0, ),
    
]
# Whole batches per pass over each flow (a trailing partial batch is dropped).
STEP_SIZE_TRAIN = train_generator.n//train_generator.batch_size
STEP_SIZE_VALID = valid_generator.n//valid_generator.batch_size
STEP_SIZE_TEST = test_generator.n//test_generator.batch_size

# `Model.fit` accepts generators directly (`fit_generator` is deprecated).
# The callbacks list is passed explicitly: without it the TensorBoard
# callback defined above never receives any events.
history = model1.fit(train_generator, steps_per_epoch=STEP_SIZE_TRAIN,
                     validation_data=valid_generator, validation_steps=STEP_SIZE_VALID,
                     epochs=100, verbose=1, callbacks=callbacks)

  


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Persist the model trained with the weighted loss in HDF5 format.
VGG_WCE_MODEL_PATH = "models/VGG-model1.h5"
model1.save(VGG_WCE_MODEL_PATH)

In [None]:
# Reload the saved weighted-loss model for inference only (`compile=False`
# avoids having to resolve the custom loss function) and score the test flow.
# `Model.predict` replaces the deprecated `predict_generator`.
model1 = keras.models.load_model("models/VGG-model1.h5", compile=False)
prob1 = model1.predict(test_generator)

  


In [None]:
# Weighted-loss model, label column 0 (`Effusion`).
roc_auc_score(y_true=test_label[:, 0], y_score=prob1[:, 0])

0.4406138823013611

In [None]:
# Weighted-loss model, label column 1 (`No Finding`).
roc_auc_score(y_true=test_label[:, 1], y_score=prob1[:, 1])

0.613456808343172

In [None]:
# Weighted-loss model, label column 2 (`Pneumonia`).
roc_auc_score(y_true=test_label[:, 2], y_score=prob1[:, 2])

0.40933806426489006

In [None]:
# Weighted-loss model, label column 3 (`Pneumothorax`).
roc_auc_score(y_true=test_label[:, 3], y_score=prob1[:, 3])

0.5225354516793382