In [1]:
import os
# Project root, resolved two directory levels above the current working directory.
ROOT_PATH = os.path.abspath("../..") # "/home/jovyan/ChestXray-14"

In [2]:
import sys
# Add the project root to the import search path.
# NOTE(review): presumably this is what makes the `modules.*` imports in later cells resolvable — confirm.
sys.path.append(ROOT_PATH)

In [3]:
import tensorflow as tf

from modules.utils import get_dataset
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, TensorBoard

2023-04-08 14:18:04.797665: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
from modules.dataset import LABELS

ROOT_PATH: /home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification


In [5]:
# Project root (same expression as in the first cell).
ROOT_PATH = os.path.abspath("../..") # "/home/jovyan/ChestXray-14"
# Dataset root; the Dataset class builds its TFRecord glob patterns from it.
INPUT_PATH = f"{ROOT_PATH}/dataset/ChestXray NIH"
# Experiment tag embedded in the log, checkpoint and history output paths.
EXPERIMENT_NAME = "under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout"

In [6]:
class Dataset:
    """Resolve and load the TFRecord files of one cross-validation fold.

    Each getter globs a ``*.tfrec`` pattern with ``tf.io.gfile.glob`` and
    passes the resulting file list to the project's ``get_dataset`` helper,
    returning whatever that helper produces.
    """

    # Dataset root, taken from the module-level constant of the same name.
    INPUT_PATH = INPUT_PATH

    def __init__(self, fold_num):
        """Store the fold to read.

        Args:
            fold_num: Value interpolated into the ``fold{fold_num}`` directory
                name used by the train and validation patterns.
        """
        self.DATA_PATH = "under_sampling_5_folds_dataset_train_valid_test"
        self.fold_num = fold_num

    def _load(self, pattern):
        """Glob ``pattern`` and build a dataset from the matching files."""
        return get_dataset(tf.io.gfile.glob(pattern))

    def get_train(self):
        """Return the dataset built from this fold's ``train/*.tfrec`` files."""
        return self._load(f'{self.INPUT_PATH}/data/{self.DATA_PATH}/folds/fold{self.fold_num}/train/*.tfrec')

    def get_valid(self):
        """Return the dataset built from this fold's ``valid/*.tfrec`` files."""
        return self._load(f'{self.INPUT_PATH}/data/{self.DATA_PATH}/folds/fold{self.fold_num}/valid/*.tfrec')

    def get_test(self):
        """Return the dataset built from ``test/*.tfrec``; this path does not depend on the fold."""
        return self._load(f'{self.INPUT_PATH}/data/{self.DATA_PATH}/test/*.tfrec')

## tf.keras.mixed_precision for faster training

In [7]:
# Show the current global dtype policy, switch Keras to 'mixed_float16',
# then show the policy again to confirm the change took effect.
print(tf.keras.mixed_precision.global_policy())
tf.keras.mixed_precision.set_global_policy('mixed_float16')
print(tf.keras.mixed_precision.global_policy())

<Policy "float32">
INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA A100-SXM4-40GB MIG 2g.10gb, compute capability 8.0
<Policy "mixed_float16">


## Model Training

In [8]:
# Absolute path of the current working directory (abspath of the empty string);
# used as the base directory for TensorBoard logs and CSV history files.
CURRENT_PATH = os.path.abspath("")
CURRENT_PATH

'/home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification'

In [9]:
# Learning rate
def lr_schedule(epoch, learning_rate):
    """Log the current learning rate and hand it back unchanged.

    Intended for use with ``LearningRateScheduler``: the schedule itself is a
    no-op, the call exists only to emit a ``'learning rate'`` summary scalar.
    NOTE(review): the scalar is presumably only persisted when a default
    summary writer is active at call time — confirm.

    Args:
        epoch: Epoch index, used as the summary step.
        learning_rate: Learning rate currently in effect.

    Returns:
        ``learning_rate``, unmodified.
    """
    tf.summary.scalar(name='learning rate', data=learning_rate, step=epoch)
    return learning_rate

In [10]:
import datetime

def get_callbacks(NAME, weight_option, fold_num):
    """Create the Keras callbacks for training a single fold.

    Args:
        NAME: Model tag embedded in the log directory and checkpoint file name.
        weight_option: Weight-initialisation tag embedded in the same names.
        fold_num: Fold identifier embedded in the same names.

    Returns:
        A 5-tuple, in this order: ``ModelCheckpoint``, ``EarlyStopping``,
        ``ReduceLROnPlateau``, ``LearningRateScheduler`` (wrapping
        ``lr_schedule``) and ``TensorBoard``.
    """
    # A timestamp suffix gives every invocation its own log directory.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = f"{CURRENT_PATH}/logs/{EXPERIMENT_NAME}/{NAME}_{weight_option}_FOLD_{fold_num}_" + timestamp

    # Keep only the weights with the lowest validation loss.
    # Note: this path is relative to the process working directory.
    checkpoint = ModelCheckpoint(
        f'results/models/{EXPERIMENT_NAME}/{NAME}_{weight_option}_FOLD_{fold_num}.h5',
        monitor='val_loss',
        mode='min',
        save_best_only=True,
    )

    # Halve the learning rate after 3 epochs without val_loss improvement ...
    plateau = ReduceLROnPlateau(monitor='val_loss', mode="min", factor=0.5, patience=3, verbose=1)
    # ... and give up on the fold after 20 such epochs.
    early_stopping = EarlyStopping(monitor='val_loss', mode="min", patience=20, verbose=1)

    # Pass-through scheduler whose only job is to log the learning rate.
    lr_logger = LearningRateScheduler(lr_schedule)

    tensorboard = TensorBoard(log_dir=log_dir, histogram_freq=1)

    return checkpoint, early_stopping, plateau, lr_logger, tensorboard

In [11]:
# Constant variables
# Model tag embedded in log, checkpoint and history output names.
NAME = "EfficientNetB0"
# Maximum number of epochs per fold passed to `model.fit`.
EPOCHS = 100
# Forwarded as `weights=` to EfficientNetB0 and embedded in output names.
weight_option = None # use `imagenet` or `None` only

In [12]:
import pprint
import numpy as np

def check_label_on_dataset(train_dataset, test_dataset):
    """Print per-label sums and row counts for two datasets.

    Both arguments are iterated once in full; each element must unpack into a
    ``(features, labels)`` pair. The label parts are stacked row-wise, summed
    per column and paired with ``LABELS`` for display.
    NOTE(review): assumes every label array has one column per entry of
    ``LABELS`` — confirm against ``get_dataset``.

    Args:
        train_dataset: First iterable of ``(features, labels)`` pairs.
        test_dataset: Second iterable of ``(features, labels)`` pairs.

    Returns:
        None. The function only prints.
    """
    def stack_labels(dataset):
        # Features are ignored; only the label half of each element is kept.
        return np.vstack([labels for _, labels in dataset])

    train_labels = stack_labels(train_dataset)
    test_labels = stack_labels(test_dataset)
    n_train, n_test = len(train_labels), len(test_labels)

    # Report: per-label sums followed by the number of stacked rows.
    pprint.pprint(list(zip(LABELS, train_labels.sum(axis=0))))
    print("Count:", n_train)
    pprint.pprint(list(zip(LABELS, test_labels.sum(axis=0))))
    print("Count: ", n_test)

    print("All:", n_train + n_test)

In [13]:
from modules.models import Model_under_sampling_dropout

In [14]:
# Main: train one model per cross-validation fold.
#
# For every fold this cell builds the callbacks, prepares the CSV history
# directory, loads the train/validation datasets, prints their label
# statistics, builds a fresh EfficientNetB0-based model and fits it.
#
# NOTE(review): the saved output of this cell ends with an
# `InvalidArgumentError: predictions must be >= 0` raised from a metric update
# during validation. The cause is not visible in this notebook; a possible
# culprit is non-finite predictions under the `mixed_float16` policy — confirm,
# and check that the model's output layer computes in float32.
import time
from tqdm.notebook import tqdm

NUM_FOLDS = 5
for fold_num in tqdm(range(1, NUM_FOLDS + 1)):

    # Callbacks
    (
        model_checkpoint_callback,
        early_stop_callback,
        reduce_lr_callback,
        lr_logging_callback,
        tensorboard_callback
    ) = get_callbacks(
        NAME, weight_option, fold_num
    )

    # Path for CSV
    path = os.path.join(CURRENT_PATH, "results", "history", EXPERIMENT_NAME, f"{NAME}_{weight_option}_FOLD_{fold_num}")
    os.makedirs(path, exist_ok=True)

    # CSV Logger
    csv_logger = CSVLogger(os.path.join(path, "history.csv"))

    # Dataset
    dataset = Dataset(fold_num)
    train_dataset, validation_dataset = dataset.get_train(), dataset.get_valid()
    # Sanity check: print label sums for the train and validation splits.
    check_label_on_dataset(train_dataset, validation_dataset)

    # Modeling: a new backbone and head are built for every fold so that no
    # weights carry over between folds.
    transfer_model = tf.keras.applications.efficientnet.EfficientNetB0(
        include_top=False,
        weights=weight_option,
        input_shape=(224, 224, 3),
        pooling=None
    )

    model = Model_under_sampling_dropout(
        transfer_model,
    )
    model = model.get_model()
    model._name = f"{EXPERIMENT_NAME}_FOLD_{fold_num}"
    model.summary()

    # Record time for training
    start = time.time()

    # Train. The TensorBoard callback returned by `get_callbacks` is now
    # actually registered; previously it was created but never passed to
    # `fit`, so nothing was written to its log directory.
    history = model.fit(
        train_dataset,
        epochs=EPOCHS,
        validation_data=validation_dataset,
        verbose=1, # Show progress bar while training
        callbacks=[
            model_checkpoint_callback,
            csv_logger,
            early_stop_callback,
            reduce_lr_callback,
            lr_logging_callback,
            tensorboard_callback,
        ]
    )

    end = time.time()
    print(
        "===== " * 5,
        "Model Traninig for: {:.2f} second(s)".format(end - start),
        "===== " * 5,
        "\n\n"
    )

  0%|          | 0/5 [00:00<?, ?it/s]

2023-04-08 14:18:07.965079: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-08 14:18:08.714702: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8011 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB MIG 2g.10gb, pci bus id: 0000:31:00.0, compute capability: 8.0


[('No Finding', 6057),
 ('Atelectasis', 7541),
 ('Consolidation', 2994),
 ('Infiltration', 12735),
 ('Pneumothorax', 3407),
 ('Edema', 1471),
 ('Emphysema', 1607),
 ('Fibrosis', 1099),
 ('Effusion', 8531),
 ('Pneumonia', 909),
 ('Pleural_Thickening', 2190),
 ('Cardiomegaly', 1789),
 ('Nodule', 4053),
 ('Mass', 3750),
 ('Hernia', 139)]
Count: 39244
[('No Finding', 1503),
 ('Atelectasis', 1793),
 ('Consolidation', 779),
 ('Infiltration', 3208),
 ('Pneumothorax', 849),
 ('Edema', 386),
 ('Emphysema', 422),
 ('Fibrosis', 268),
 ('Effusion', 2161),
 ('Pneumonia', 228),
 ('Pleural_Thickening', 532),
 ('Cardiomegaly', 430),
 ('Nodule', 993),
 ('Mass', 891),
 ('Hernia', 42)]
Count:  9811
All: 49055
Model: "under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout_FOLD_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional)  (None, 7, 7, 1280)       4049571   
     

2023-04-08 14:18:39.489210: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2023-04-08 14:18:40.398749: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-08 14:18:40.400184: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-08 14:18:40.400240: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2023-04-08 14:18:40.401517: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-08 14:18:40.401636: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 8: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 11: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 14: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 17: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 20: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 23: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 26: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 29: ReduceLROnPlateau re

InvalidArgumentError: Graph execution error:

Detected at node 'assert_greater_equal/Assert/AssertGuard/Assert' defined at (most recent call last):
    File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 976, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_354893/2309012708.py", line 50, in <cell line: 6>
      history = model.fit(
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1445, in fit
      val_logs = self.evaluate(
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1756, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1557, in test_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1546, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1535, in run_step
      outputs = model.test_step(data)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1502, in test_step
      return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 987, in compute_metrics
      self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 501, in update_state
      metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 70, in decorated
      update_op = update_state_fn(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 140, in update_state_fn
      return ag_update_state(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/metrics/metrics.py", line 1759, in update_state
      return metrics_utils.update_confusion_matrix_variables(
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 602, in update_confusion_matrix_variables
      tf.debugging.assert_greater_equal(
Node: 'assert_greater_equal/Assert/AssertGuard/Assert'
Detected at node 'assert_greater_equal/Assert/AssertGuard/Assert' defined at (most recent call last):
    File "/opt/conda/lib/python3.10/runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "/opt/conda/lib/python3.10/runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "/opt/conda/lib/python3.10/site-packages/traitlets/config/application.py", line 976, in launch_instance
      app.start()
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 712, in start
      self.io_loop.start()
    File "/opt/conda/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 199, in start
      self.asyncio_loop.run_forever()
    File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 600, in run_forever
      self._run_once()
    File "/opt/conda/lib/python3.10/asyncio/base_events.py", line 1896, in _run_once
      handle._run()
    File "/opt/conda/lib/python3.10/asyncio/events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 510, in dispatch_queue
      await self.process_one()
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 499, in process_one
      await dispatch(*args)
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 406, in dispatch_shell
      await result
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 730, in execute_request
      reply_content = await reply_content
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 383, in do_execute
      res = shell.run_cell(
    File "/opt/conda/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 528, in run_cell
      return super().run_cell(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2881, in run_cell
      result = self._run_cell(
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 2936, in _run_cell
      return runner(coro)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3135, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3338, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "/opt/conda/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3398, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "/tmp/ipykernel_354893/2309012708.py", line 50, in <cell line: 6>
      history = model.fit(
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1445, in fit
      val_logs = self.evaluate(
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/traceback_utils.py", line 64, in error_handler
      return fn(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1756, in evaluate
      tmp_logs = self.test_function(iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1557, in test_function
      return step_function(self, iterator)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1546, in step_function
      outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1535, in run_step
      outputs = model.test_step(data)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 1502, in test_step
      return self.compute_metrics(x, y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/training.py", line 987, in compute_metrics
      self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "/opt/conda/lib/python3.10/site-packages/keras/engine/compile_utils.py", line 501, in update_state
      metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 70, in decorated
      update_op = update_state_fn(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/metrics/base_metric.py", line 140, in update_state_fn
      return ag_update_state(*args, **kwargs)
    File "/opt/conda/lib/python3.10/site-packages/keras/metrics/metrics.py", line 1759, in update_state
      return metrics_utils.update_confusion_matrix_variables(
    File "/opt/conda/lib/python3.10/site-packages/keras/utils/metrics_utils.py", line 602, in update_confusion_matrix_variables
      tf.debugging.assert_greater_equal(
Node: 'assert_greater_equal/Assert/AssertGuard/Assert'
2 root error(s) found.
  (0) INVALID_ARGUMENT:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (Cast_3:0) = ] [[0.157104492 0.219970703 0.0889282227...]...] [y (Cast_5/x:0) = ] [0]
	 [[{{node assert_greater_equal/Assert/AssertGuard/Assert}}]]
	 [[assert_greater_equal_1/Assert/AssertGuard/pivot_f/_23/_65]]
  (1) INVALID_ARGUMENT:  assertion failed: [predictions must be >= 0] [Condition x >= y did not hold element-wise:] [x (Cast_3:0) = ] [[0.157104492 0.219970703 0.0889282227...]...] [y (Cast_5/x:0) = ] [0]
	 [[{{node assert_greater_equal/Assert/AssertGuard/Assert}}]]
0 successful operations.
0 derived errors ignored. [Op:__inference_test_function_2742432]

In [None]:
# Completion marker printed when this final cell runs.
print("Done")