In [1]:
import json
import math
import os
from datetime import datetime
from typing import *

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from google.cloud import storage

2023-05-20 15:22:01.874836: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-20 15:22:06.489775: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-05-20 15:22:15.538932: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory; LD_LIBRARY_PATH: /usr/local/cuda/lib64:/usr/local/cuda/lib:/usr/local/lib/x86_64-linux-gnu:/usr/local/nvidia/lib:/usr/local/nvidia/lib64:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
2023-05-20 15:22:15.539228: W tensorflow/strea

In [4]:
# Ask TensorFlow which physical GPUs it can see and report how many there are.
visible_gpus = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(visible_gpus))

Num GPUs Available:  1


2023-05-20 15:24:55.455012: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-20 15:24:56.091040: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-20 15:24:56.092873: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero


In [12]:
# Print every package installed in the active conda environment, with versions.
!conda list

# packages in environment at /opt/conda:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       2_gnu    conda-forge
absl-py                   1.4.0                    pypi_0    pypi
aiohttp                   3.8.4                    pypi_0    pypi
aiohttp-cors              0.7.0                    pypi_0    pypi
aiorwlock                 1.3.0                    pypi_0    pypi
aiosignal                 1.3.1              pyhd8ed1ab_0    conda-forge
ansiwrap                  0.8.4                    pypi_0    pypi
anyio                     3.6.2              pyhd8ed1ab_0    conda-forge
apache-beam               2.46.0                   pypi_0    pypi
argon2-cffi               21.3.0             pyhd8ed1ab_0    conda-forge
argon2-cffi-bindings      21.2.0           py37h540881e_2    conda-forge
astunparse                1.6.3                    pypi_

## Model

In [5]:
class NiHClassifier(tf.keras.Model):
    """Image classifier: ImageNet-pretrained ResNet50 backbone plus a dense head.

    The backbone is created without its classification top and is frozen on
    construction. Its feature maps are global-average-pooled and passed through
    ``Dense(256, relu)`` followed by a sigmoid ``Dense`` layer with one unit per
    output class.

    NOTE(review): inputs are handed to the backbone unchanged. Keras documents
    ``tf.keras.applications.resnet50.preprocess_input`` as the expected input
    preprocessing for this network -- confirm the input pipeline's scaling is
    what is intended.
    """

    def __init__(self, number_of_output_classes: int,
                 image_shape: Tuple[int, int, int] = (224, 224, 3)):
        """Create the frozen backbone and the trainable head, then build the model.

        Args:
            number_of_output_classes: Number of units in the final sigmoid layer.
            image_shape: ``(height, width, channels)`` of the input images.
        """
        super().__init__()
        self.number_of_output_classes: int = number_of_output_classes
        self.image_shape: Tuple[int, int, int] = image_shape

        self.pretrained_resnet50 = tf.keras.applications.resnet50.ResNet50(include_top=False,
                                                                           weights="imagenet",
                                                                           input_shape=image_shape)
        # Start with the whole backbone frozen; see unfreeze_top_layers().
        self.pretrained_resnet50.trainable = False

        self.global_average_pooling = tf.keras.layers.GlobalAveragePooling2D()
        self.prediction_layer = tf.keras.Sequential([
            tf.keras.layers.Dense(256, activation='relu'),
            tf.keras.layers.Dense(self.number_of_output_classes, activation=tf.keras.activations.sigmoid)
        ])

        # Build for batches of any size with the configured image shape.
        self.build(input_shape=(None, image_shape[0], image_shape[1], image_shape[2]))

    def unfreeze_top_layers(self, fine_tune_top_n: int):
        """Make only the last ``fine_tune_top_n`` backbone layers trainable.

        The backbone is first marked trainable as a whole, then every layer
        except the last ``fine_tune_top_n`` is frozen again. A value larger than
        the number of backbone layers leaves the entire backbone trainable.

        Args:
            fine_tune_top_n: How many layers, counted from the end of the
                backbone's layer list, should remain trainable.

        Raises:
            ValueError: If ``fine_tune_top_n`` is negative.
        """
        if fine_tune_top_n < 0:
            raise ValueError(
                f"fine_tune_top_n must be non-negative, got {fine_tune_top_n}")

        self.pretrained_resnet50.trainable = True

        backbone_layers = self.pretrained_resnet50.layers
        # Clamp at zero so an oversized request does not turn into a negative
        # slice bound (which would freeze the wrong layers).
        layers_to_freeze: int = max(len(backbone_layers) - fine_tune_top_n, 0)

        for layer in backbone_layers[:layers_to_freeze]:
            layer.trainable = False

    def call(self, inputs, training=None, mask=None):
        """Run backbone -> global average pooling -> dense head.

        Args:
            inputs: Batch of images fed to the backbone.
            training: Forwarded to the backbone call.
            mask: Accepted for the Keras ``call`` signature; not used here.

        Returns:
            The output of the sigmoid prediction head.
        """
        resnet_features = self.pretrained_resnet50(inputs, training=training)
        avg_pooling_features = self.global_average_pooling(resnet_features)
        predictions = self.prediction_layer(avg_pooling_features)
        return predictions


## Google Cloud Storage interaction

In [14]:
def open_bucket(BUCKET_NAME):
    """Return the Cloud Storage bucket called ``BUCKET_NAME``.

    A new ``storage.Client`` is created on every call and used to look the
    bucket up with ``get_bucket``.
    """
    return storage.Client().get_bucket(BUCKET_NAME)

def open_file_gcs(path_to_file: str, bucket, mode: str):
    """Open the blob at ``path_to_file`` inside ``bucket`` and return the handle.

    Args:
        path_to_file: Blob name passed to ``bucket.blob``.
        bucket: Object exposing ``blob(name)``, e.g. the result of ``open_bucket``.
        mode: Mode string forwarded to the blob's ``open``.

    Returns:
        Whatever ``blob.open(mode)`` returns.
    """
    target_blob = bucket.blob(path_to_file)
    file_handle = target_blob.open(mode)
    return file_handle

## Helper methods

In [6]:
def read_image_data_augmentation(file_path, label):
    """Load an image like ``read_image`` and mirror it horizontally half the time.

    Args:
        file_path: Path of the image file to read.
        label: Label associated with the image; returned unchanged.

    Returns:
        ``(image, label)`` where ``image`` has been flipped left-right when a
        uniform random draw exceeded 0.5.
    """
    # Reuse the plain loader so decoding settings cannot drift between the
    # augmented and non-augmented pipelines.
    image, label = read_image(file_path, label)
    if tf.random.uniform(shape=[]) > 0.5:
        image = tf.image.flip_left_right(image)
    return image, label


def read_image(file_path, label):
    """Read the file at ``file_path`` and decode it as a 3-channel float32 image.

    Args:
        file_path: Path of the image file to read.
        label: Label associated with the image; returned unchanged.

    Returns:
        ``(decoded_image, label)``.
    """
    raw_bytes = tf.io.read_file(file_path)
    decoded_image = tf.image.decode_image(raw_bytes, channels=3, dtype=tf.float32)
    return decoded_image, label


def scheduler(epoch: int, lr: float) -> float:
    """Learning-rate schedule: hold for the first 10 epochs, then decay.

    Args:
        epoch: Zero-based epoch index.
        lr: Learning rate currently in use.

    Returns:
        ``lr`` unchanged while ``epoch < 10``; afterwards ``lr * exp(-0.1)``.
        Always a plain Python number, never a tensor, so the declared return
        type holds.
    """
    if epoch < 10:
        return lr
    # math.exp keeps the result a Python float; tf.math.exp would return a Tensor.
    return lr * math.exp(-0.1)


def load_dataset(path_to_images: str,
                 path_to_pkl: str,
                 model_type: str,
                 batch_size: int,
                 data_augmentation: bool = False) -> tf.data.Dataset:
    """Build a batched ``tf.data`` pipeline of ``(image, label)`` pairs.

    Args:
        path_to_images: Prefix prepended to every entry of the ``'Image Index'``
            column to form the image path.
        path_to_pkl: Location of the pickled DataFrame describing the split.
        model_type: ``"Binary"`` takes labels from the ``'No Finding'`` column;
            any other value stacks the ``'multi_category_labels'`` column.
        batch_size: Number of examples per batch.
        data_augmentation: When true, images are loaded with
            ``read_image_data_augmentation`` instead of ``read_image``.

    Returns:
        A dataset yielding batches of decoded images and their labels, in the
        same order as the rows of the DataFrame.
    """
    # NOTE(review): unpickling can execute arbitrary code -- only load pickles
    # from a trusted location.
    df = pd.read_pickle(path_to_pkl)
    image_paths = (path_to_images + df['Image Index']).values

    if model_type == "Binary":
        labels = df['No Finding'].values
    else:
        labels = np.stack(df['multi_category_labels'].values)

    load_fn = read_image_data_augmentation if data_augmentation else read_image

    # Decode images in parallel and overlap input processing with training.
    # map() stays deterministic by default, so element order is unchanged.
    return (tf.data.Dataset.from_tensor_slices((image_paths, labels))
            .map(load_fn, num_parallel_calls=tf.data.AUTOTUNE)
            .batch(batch_size=batch_size)
            .prefetch(tf.data.AUTOTUNE))


def save_history(history, path: str):
    """Write ``history.history`` to ``path`` as JSON.

    Args:
        history: Object exposing a ``history`` dict (e.g. the value returned by
            ``Model.fit``).
        path: Destination file. Missing parent directories are created; a bare
            filename is written to the current working directory.
    """
    directory = os.path.dirname(path)
    # dirname is '' for a bare filename, and os.makedirs('') would raise.
    if directory:
        os.makedirs(directory, exist_ok=True)

    with open(path, 'w') as file:
        # default=float converts NumPy scalars, which json cannot encode natively.
        json.dump(history.history, file, default=float)


def plot_loss_acc(history, path: str, model_type: str):
    """Plot the training/validation metric and loss curves, save and show them.

    The top panel shows accuracy for ``model_type == 'Binary'`` (history keys
    ``'accuracy'``/``'val_accuracy'``) and AUC otherwise (``'auc'``/``'val_auc'``).
    The bottom panel shows ``'loss'``/``'val_loss'``.

    Args:
        history: Object exposing a ``history`` dict of per-epoch values.
        path: File the figure is saved to. Missing parent directories are
            created; a bare filename is written to the working directory.
        model_type: ``'Binary'`` or anything else (treated as multi-category).
    """
    directory = os.path.dirname(path)
    # dirname is '' for a bare filename, and os.makedirs('') would raise.
    if directory:
        os.makedirs(directory, exist_ok=True)

    if model_type == 'Binary':
        metric_key, metric_label = 'accuracy', 'Accuracy'
    else:
        metric_key, metric_label = 'auc', 'AUC'

    fig, (metric_ax, loss_ax) = plt.subplots(2, 1, figsize=(8, 8))

    metric_ax.plot(history.history[metric_key], label=f'Training {metric_label}')
    metric_ax.plot(history.history[f'val_{metric_key}'], label=f'Validation {metric_label}')
    metric_ax.legend(loc='lower right')
    metric_ax.set_ylabel(metric_label)
    # Keep the automatically chosen lower bound but pin the top of the axis to 1.
    metric_ax.set_ylim([min(metric_ax.get_ylim()), 1])
    metric_ax.set_title(f'Training and Validation {metric_label}')

    loss_ax.plot(history.history['loss'], label='Training Loss')
    loss_ax.plot(history.history['val_loss'], label='Validation Loss')
    loss_ax.legend(loc='upper right')
    loss_ax.set_ylabel('Cross Entropy')
    loss_ax.set_ylim([0, 1.0])
    loss_ax.set_title('Training and Validation Loss')
    loss_ax.set_xlabel('epoch')

    fig.tight_layout()
    # The directory for `path` is prepared above, so actually persist the figure.
    fig.savefig(path)
    plt.show()

## Load data

In [7]:
BUCKET_NAME = 'dldsproject'

# Derive every location from BUCKET_NAME so the bucket is named in one place only.
nih_root: str = f'gs://{BUCKET_NAME}/NiH/'
resized_images_root: str = nih_root + 'images_resized/'

path_to_training_images: str = resized_images_root + 'train/'
path_to_validation_images: str = resized_images_root + 'val/'
path_to_test_images: str = resized_images_root + 'test/'
path_to_training_pkl: str = nih_root + 'training_data_extended_4.pkl'
path_to_validation_pkl: str = nih_root + 'validation_data_extended_4.pkl'
path_to_test_pkl: str = nih_root + 'test_data_extended_4.pkl'

batch_size: int = 64

In [8]:
# One dataset per split. Each split reads its OWN image folder and pickle;
# validation and test must not be built from the training files.
binary_train_ds = load_dataset(path_to_training_images,
                               path_to_training_pkl,
                               model_type='Binary',
                               batch_size=batch_size,
                               data_augmentation=False)
binary_val_ds = load_dataset(path_to_validation_images,
                             path_to_validation_pkl,
                             model_type='Binary',
                             batch_size=batch_size,
                             data_augmentation=False)
binary_test_ds = load_dataset(path_to_test_images,
                              path_to_test_pkl,
                              model_type='Binary',
                              batch_size=batch_size,
                              data_augmentation=False)

2023-05-20 15:25:25.426958: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-05-20 15:25:25.447056: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-20 15:25:25.450661: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2023-05-20 15:25:25.453305: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:980] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zer

## Train model

In [9]:
def train(training_dataset,
          validation_dataset,
          initial_epochs: int,
          fine_tuning_epochs: int,
          lr: float,
          fine_tune_at: int,
          lr_scheduler: bool,
          model_type: str = 'Binary'):
    """Create, compile and fit an ``NiHClassifier`` with its backbone frozen.

    Args:
        training_dataset: Dataset passed to ``fit`` as training data.
        validation_dataset: Dataset passed to ``fit`` as ``validation_data``.
        initial_epochs: Number of epochs to train for.
        fine_tuning_epochs: Currently unused; no fine-tuning phase is run here.
        lr: Learning rate for the Adam optimizer.
        fine_tune_at: Currently unused; no fine-tuning phase is run here.
        lr_scheduler: When true, a ``LearningRateScheduler`` callback driven by
            ``scheduler`` is attached.
        model_type: ``'Binary'`` trains a single-output model with binary
            cross-entropy and accuracy; any other value trains a 15-output
            model with categorical cross-entropy and AUC.

    Returns:
        ``(model, history)``: the fitted classifier and the ``History`` object
        returned by ``fit``.
    """
    if model_type == 'Binary':
        number_of_output_classes: int = 1
        loss = tf.keras.losses.BinaryCrossentropy()
        metric = ['accuracy']
    else:
        number_of_output_classes = 15
        # NOTE(review): the classifier head uses sigmoid outputs; if the labels
        # are multi-hot, BinaryCrossentropy is probably the intended loss --
        # confirm before relying on multi-category results.
        loss = tf.keras.losses.CategoricalCrossentropy()
        # Name the metric explicitly so the history keys are always
        # 'auc'/'val_auc', which is what plot_loss_acc looks up.
        metric = [tf.keras.metrics.AUC(name='auc')]

    callbacks = []
    if lr_scheduler:
        callbacks.append(tf.keras.callbacks.LearningRateScheduler(scheduler))

    # TODO: add a ModelCheckpoint callback (save_weights_only, save_best_only,
    # mode='max', monitoring 'val_accuracy' for Binary or 'val_auc' otherwise)
    # once a directory for saved models is defined.

    nih_classifier: NiHClassifier = NiHClassifier(number_of_output_classes=number_of_output_classes)
    nih_classifier.compile(optimizer=tf.keras.optimizers.Adam(lr),
                           loss=loss,
                           metrics=metric)

    training_history = nih_classifier.fit(training_dataset,
                                          epochs=initial_epochs,
                                          validation_data=validation_dataset,
                                          callbacks=callbacks)

    # Hand the results back; otherwise the trained model is lost when the
    # function returns.
    return nih_classifier, training_history

In [10]:
# Optimizer settings.
lr: float = 1e-4
lr_scheduler: bool = False

# Training schedule; all three values are forwarded to train().
initial_epochs: int = 10
fine_tuning_epochs: int = 10
fine_tune_at: int = 45

In [11]:
# Train the binary classifier using the hyper-parameters defined above.
train(training_dataset=binary_train_ds,
      validation_dataset=binary_val_ds,
      initial_epochs=initial_epochs,
      fine_tuning_epochs=fine_tuning_epochs,
      lr=lr,
      fine_tune_at=fine_tune_at,
      lr_scheduler=lr_scheduler,
      model_type='Binary')

Epoch 1/10


2023-05-20 15:26:08.225279: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8200
2023-05-20 15:26:17.335552: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-05-20 15:26:17.336364: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-05-20 15:26:17.336435: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2023-05-20 15:26:17.337366: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-05-20 15:26:17.337495: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


   9/1077 [..............................] - ETA: 1:56:01 - loss: 0.7648 - accuracy: 0.4653

KeyboardInterrupt: 