In [1]:
!pip install tensorflow_addons

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com


In [2]:
import sys

# Make the project's `modules` package importable from this notebook.
# Guarded so that re-running the cell does not keep appending duplicate entries.
if '/home/jovyan/ChestXray-14' not in sys.path:
    sys.path.append('/home/jovyan/ChestXray-14')

In [3]:
import tensorflow as tf

from modules.models import Model_with_dropout
from modules.utils import get_dataset

from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, TensorBoard

2023-04-05 04:58:22.151726: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
from modules.dataset import LABELS

In [5]:
# Absolute root of the ChestXray-14 project checkout (the same location that is appended to sys.path).
ROOT_PATH = "/home/jovyan/ChestXray-14"
# Root of the NIH ChestXray data; the Dataset class globs its fold TFRecords beneath this directory.
INPUT_PATH = f"{ROOT_PATH}/dataset/ChestXray NIH"
# Experiment tag embedded in the TensorBoard log dir, checkpoint path, history path and model name.
EXPERIMENT_NAME = "under_sampling_5_folds_with_cross_entropy_loss_freeze_imagenet"

In [6]:
class Dataset:
    """Loads the TFRecord train/test splits of one cross-validation fold.

    Shards are looked up with the pattern
    ``{INPUT_PATH}/data/under_sampling_5_folds_dataset/folds/fold{fold_num}/{split}/*.tfrec``
    and the matching file names are handed to ``get_dataset`` (imported from
    ``modules.utils``).
    """

    # Class-level alias of the notebook-level INPUT_PATH; methods read it via self.INPUT_PATH.
    INPUT_PATH = INPUT_PATH

    def __init__(self, fold_num):
        """Remember which fold to load.

        Args:
            fold_num: Fold identifier, interpolated into the ``fold{fold_num}`` directory name.
        """
        self.fold_num = fold_num

    def _get_split(self, split):
        """Glob the ``.tfrec`` shards of one split and build a dataset from them.

        Args:
            split: Sub-directory name of the split (``'train'`` or ``'test'``).

        Returns:
            Whatever ``get_dataset`` returns for the matched file names.

        Raises:
            FileNotFoundError: If no shard matches the pattern. Failing here gives a
                clear message instead of passing an empty file list on, where it
                would otherwise only surface as an obscure error further downstream.
        """
        pattern = f'{self.INPUT_PATH}/data/under_sampling_5_folds_dataset/folds/fold{self.fold_num}/{split}/*.tfrec'
        filenames = tf.io.gfile.glob(pattern)
        if not filenames:
            raise FileNotFoundError(f"No TFRecord files match {pattern!r}")
        return get_dataset(filenames)

    def get_train(self):
        """Return the dataset built from this fold's ``train`` shards."""
        return self._get_split('train')

    def get_test(self):
        """Return the dataset built from this fold's ``test`` shards."""
        return self._get_split('test')

## tf.keras.mixed_precision for faster training

In [7]:
# Print the global Keras mixed-precision policy before and after the (disabled) switch below.
print(tf.keras.mixed_precision.global_policy())
# Uncomment to switch the global policy to 'mixed_float16'; while this stays commented out,
# both prints report the same policy.
# tf.keras.mixed_precision.set_global_policy('mixed_float16')
print(tf.keras.mixed_precision.global_policy())

<Policy "float32">
<Policy "float32">


## Model Training

In [8]:
import os
# Directory the notebook is executed from: os.path.abspath("") resolves the empty path
# against the current working directory at the time this cell runs.
# The TensorBoard log dir and the CSV history dir are built beneath this path.
CURRENT_PATH = os.path.abspath("")
CURRENT_PATH

'/home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification'

In [9]:
# Learning-rate logging hook
def lr_schedule(epoch, learning_rate):
    """Emit the current learning rate as a summary scalar and return it unchanged.

    Passed to ``LearningRateScheduler`` purely as a logging hook: because the
    incoming value is returned as-is, this function itself never alters the
    learning rate.

    NOTE(review): ``tf.summary.scalar`` presumably only records when a default
    summary writer is active; this notebook does not set one up explicitly — confirm.

    Args:
        epoch: Epoch index, used as the summary step.
        learning_rate: Learning rate currently in effect.

    Returns:
        ``learning_rate``, unmodified.
    """
    tf.summary.scalar(name='learning rate', data=learning_rate, step=epoch)
    return learning_rate

In [10]:
import datetime

def get_callbacks(NAME, weight_option, fold_num):
    """Build the Keras callbacks for one training run of one fold.

    All output locations are anchored at ``CURRENT_PATH`` and namespaced by
    ``EXPERIMENT_NAME`` and the run name ``{NAME}_{weight_option}_FOLD_{fold_num}``.

    Args:
        NAME: Model label used in the log-dir and checkpoint file names.
        weight_option: Pretrained-weights label (e.g. ``'imagenet'`` or ``None``),
            used only for naming here.
        fold_num: Fold identifier, used only for naming here.

    Returns:
        Tuple ``(model_checkpoint_callback, early_stop_callback, reduce_lr_callback,
        lr_logging_callback, tensorboard_callback)``, in that order.
    """
    run_name = f"{NAME}_{weight_option}_FOLD_{fold_num}"

    # Timestamped log dir so repeated runs of the same fold do not overwrite each other.
    log_dir = f"{CURRENT_PATH}/logs/{EXPERIMENT_NAME}/{run_name}_" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Anchor the checkpoint under CURRENT_PATH, like the log dir above, instead of
    # relying on the process working directory, and make sure the target directory
    # exists before the first save.
    checkpoint_dir = os.path.join(CURRENT_PATH, "results", "models", EXPERIMENT_NAME)
    os.makedirs(checkpoint_dir, exist_ok=True)
    model_checkpoint_callback = ModelCheckpoint(
        os.path.join(checkpoint_dir, f"{run_name}.h5"),
        monitor='val_loss',
        mode='min',
        save_best_only=True,  # keep only the weights with the lowest val_loss so far
    )

    # Stop after 20 epochs without val_loss improvement; halve the LR after 3.
    early_stop_callback = EarlyStopping(monitor='val_loss', mode="min", patience=20, verbose=1)
    reduce_lr_callback = ReduceLROnPlateau(monitor='val_loss', mode="min", factor=0.5, patience=3, verbose=1)

    # lr_schedule returns the learning rate unchanged; it is attached for logging only.
    lr_logging_callback = LearningRateScheduler(lr_schedule)

    return model_checkpoint_callback, early_stop_callback, reduce_lr_callback, lr_logging_callback, tensorboard_callback

In [11]:
# Constant variables
# Label of the backbone; in this notebook it is only used to name log, checkpoint and
# history paths (the architecture itself is chosen where the model is built).
NAME = "EfficientNetB0"
# Maximum number of epochs passed to model.fit; EarlyStopping may end training sooner.
EPOCHS = 100

In [12]:
import pprint
import numpy as np

def check_label_on_dataset(train_dataset, test_dataset):
    """Print per-label sums and sample counts for the train and test datasets.

    Each dataset is iterated once in full; the second component of every
    element is stacked row-wise with ``np.vstack`` and summed column-wise,
    then paired with ``LABELS`` for display.

    Args:
        train_dataset: Iterable yielding ``(x, y)`` pairs for training.
        test_dataset: Iterable yielding ``(x, y)`` pairs for testing.

    Returns:
        None. Results are printed only.
    """
    def _stack_labels(dataset):
        # Only the label component of each (x, y) element is kept.
        return np.vstack([labels for _, labels in dataset])

    train_labels = _stack_labels(train_dataset)
    test_labels = _stack_labels(test_dataset)
    n_train, n_test = len(train_labels), len(test_labels)

    # Train split: per-label totals, then number of stacked rows.
    pprint.pprint(list(zip(LABELS, train_labels.sum(axis=0))))
    print("Count:", n_train)

    # Test split: same report.
    pprint.pprint(list(zip(LABELS, test_labels.sum(axis=0))))
    print("Count: ", n_test)

    print("All:", n_train + n_test)

In [13]:
# TODO: Main
import time
from tqdm.notebook import tqdm

NUM_FOLDS = 5
weight_option = 'imagenet' # use `imagenet` or `None` only

# Train one model per listed fold: callbacks -> data -> frozen backbone + head -> fit.
# Fold [1, 3]
for fold_num in tqdm([1, 3]):

    # Callbacks
    (
        model_checkpoint_callback,
        early_stop_callback,
        reduce_lr_callback,
        lr_logging_callback,
        tensorboard_callback,
    ) = get_callbacks(NAME, weight_option, fold_num)

    # Directory for the per-epoch CSV history of this run
    path = os.path.join(CURRENT_PATH, "results", "history", EXPERIMENT_NAME, f"{NAME}_{weight_option}_FOLD_{fold_num}")
    os.makedirs(path, exist_ok=True)

    # CSV Logger
    csv_logger = CSVLogger(os.path.join(path, "history.csv"))

    # Dataset: load both splits of this fold and print their label distribution.
    dataset = Dataset(fold_num)
    train_dataset, test_dataset = dataset.get_train(), dataset.get_test()
    check_label_on_dataset(train_dataset, test_dataset)

    # Modeling: EfficientNetB0 feature extractor without its classification top.
    transfer_model = tf.keras.applications.efficientnet.EfficientNetB0(
        include_top=False,
        weights=weight_option,
        input_shape=(224, 224, 3),
        pooling=None
    )
    # Freeze the backbone so that its weights are not updated during training.
    transfer_model.trainable = False

    # NOTE(review): Model_with_dropout comes from modules.models; presumably it adds the
    # classification head and compiles the model — confirm there.
    model = Model_with_dropout(
        transfer_model,
    )
    model = model.get_model()
    model._name = f"{EXPERIMENT_NAME}_FOLD_{fold_num}"
    model.summary()

    # Record time for training
    start = time.time()

    # Train.
    # NOTE(review): the fold's test split is used as validation data, so checkpoint
    # selection, early stopping and LR reduction are all driven by it — confirm intended.
    history = model.fit(
        train_dataset,
        epochs=EPOCHS,
        validation_data=test_dataset,
        verbose=1, # Show progress bar while training
        callbacks=[
            model_checkpoint_callback,
            csv_logger,
            early_stop_callback,
            reduce_lr_callback,
            lr_logging_callback,
            # Previously created but never passed to fit, so its log dir stayed empty.
            tensorboard_callback,
        ],
    )

    end = time.time()
    print(
        "===== " * 5,
        "Model Traninig for: {:.2f} second(s)".format(end - start),
        "===== " * 5,
        "\n\n"
    )

    # NOTE(review): unconditional break — only the first listed fold is trained.
    # Remove it to train every fold in the list above.
    break

  0%|          | 0/2 [00:00<?, ?it/s]

2023-04-05 04:58:25.306914: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-05 04:58:26.060776: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8011 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB MIG 2g.10gb, pci bus id: 0000:31:00.0, compute capability: 8.0


[('No Finding', 7560),
 ('Atelectasis', 9334),
 ('Consolidation', 3773),
 ('Infiltration', 15943),
 ('Pneumothorax', 4256),
 ('Edema', 1857),
 ('Emphysema', 2029),
 ('Fibrosis', 1367),
 ('Effusion', 10692),
 ('Pneumonia', 1137),
 ('Pleural_Thickening', 2722),
 ('Cardiomegaly', 2219),
 ('Nodule', 5046),
 ('Mass', 4641),
 ('Hernia', 181)]
Count: 49055
[('No Finding', 12160),
 ('Atelectasis', 2225),
 ('Consolidation', 894),
 ('Infiltration', 3951),
 ('Pneumothorax', 1046),
 ('Edema', 446),
 ('Emphysema', 487),
 ('Fibrosis', 319),
 ('Effusion', 2625),
 ('Pneumonia', 294),
 ('Pleural_Thickening', 663),
 ('Cardiomegaly', 557),
 ('Nodule', 1285),
 ('Mass', 1141),
 ('Hernia', 46)]
Count:  22424
All: 71479
Model: "under_sampling_5_folds_with_cross_entropy_loss_freeze_imagenet_FOLD_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional)  (None, 7, 7, 1280)       4049571   
                

2023-04-05 04:58:58.755565: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2023-04-05 04:58:59.660595: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-05 04:58:59.661815: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-05 04:58:59.661853: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2023-04-05 04:58:59.662655: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-05 04:58:59.662712: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.
2023-04-05 04:59:00.747469: I tensorflow/stream_executor/cuda/c

Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 16: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 20: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 24: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 28: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 32: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Ep

In [15]:
# Sentinel output marking the end of the notebook.
print("Done")

Done
