In [1]:
import os
# Absolute path two directory levels above the notebook's current working directory.
ROOT_PATH = os.path.abspath("../..") # "/home/jovyan/ChestXray-14"

In [2]:
import sys
# Put ROOT_PATH on the import search path; the `modules` package imported in the
# following cells is presumably located there (verify against the repo layout).
sys.path.append(ROOT_PATH)

In [3]:
import tensorflow as tf

from modules.utils import get_dataset
from tensorflow.keras.callbacks import ModelCheckpoint, CSVLogger, EarlyStopping, ReduceLROnPlateau, LearningRateScheduler, TensorBoard

2023-04-09 14:01:59.278090: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [4]:
from modules.dataset import LABELS

ROOT_PATH: /home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification


In [5]:
# Same expression as the ROOT_PATH assignment in the first cell (re-evaluated here).
ROOT_PATH = os.path.abspath("../..") # "/home/jovyan/ChestXray-14"
# Base directory of the dataset; the Dataset class globs TFRecord files beneath it.
INPUT_PATH = f"{ROOT_PATH}/dataset/ChestXray NIH"
# Run identifier embedded in the log, checkpoint and history paths and in the model name.
EXPERIMENT_NAME = "under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout_debug"

In [6]:
class Dataset:
    """Access to the train / valid / test TFRecord splits for one fold.

    Train and valid files are looked up per fold under ``folds/fold<N>/``;
    the test files are looked up in a single ``test/`` directory that does
    not depend on the fold number.
    """

    # Snapshot of the module-level INPUT_PATH taken when the class is defined.
    INPUT_PATH = INPUT_PATH

    def __init__(self, fold_num):
        """Remember which fold this instance serves.

        Args:
            fold_num: fold identifier interpolated into the ``fold<N>`` directory name.
        """
        self.DATA_PATH = "under_sampling_5_folds_dataset_train_valid_test"
        self.fold_num = fold_num

    @staticmethod
    def _from_pattern(pattern):
        """Glob ``pattern`` and hand the matching file names to ``get_dataset``."""
        return get_dataset(tf.io.gfile.glob(pattern))

    def get_train(self):
        """Return the dataset built from this fold's ``train/*.tfrec`` files."""
        pattern = f'{self.INPUT_PATH}/data/{self.DATA_PATH}/folds/fold{self.fold_num}/train/*.tfrec'
        return self._from_pattern(pattern)

    def get_valid(self):
        """Return the dataset built from this fold's ``valid/*.tfrec`` files."""
        pattern = f'{self.INPUT_PATH}/data/{self.DATA_PATH}/folds/fold{self.fold_num}/valid/*.tfrec'
        return self._from_pattern(pattern)

    def get_test(self):
        """Return the dataset built from the fold-independent ``test/*.tfrec`` files."""
        pattern = f'{self.INPUT_PATH}/data/{self.DATA_PATH}/test/*.tfrec'
        return self._from_pattern(pattern)

## tf.keras.mixed_precision for faster training

In [7]:
# Print the global Keras dtype policy, switch it to 'mixed_float16', then print it
# again to confirm the change took effect.
print(tf.keras.mixed_precision.global_policy())
tf.keras.mixed_precision.set_global_policy('mixed_float16')
print(tf.keras.mixed_precision.global_policy())

<Policy "float32">
INFO:tensorflow:Mixed precision compatibility check (mixed_float16): OK
Your GPU will likely run quickly with dtype policy mixed_float16 as it has compute capability of at least 7.0. Your GPU: NVIDIA A100-SXM4-40GB MIG 2g.10gb, compute capability 8.0
<Policy "mixed_float16">


## Model Training

In [8]:
# os.path.abspath("") resolves to the current working directory; this is the base
# directory for the TensorBoard logs and the CSV training history written below.
CURRENT_PATH = os.path.abspath("")
CURRENT_PATH

'/home/jovyan/ChestXray-14/experiments/Under_sampling-multilabel_classification'

In [9]:
# Learning rate
def lr_schedule(epoch, learning_rate):
    """Record the learning rate as a summary scalar without modifying it.

    Intended for use with ``LearningRateScheduler``: the value received is
    written under the tag 'learning rate' at step ``epoch`` and then handed
    back unchanged, so this function never alters the schedule itself.

    Args:
        epoch: epoch index, used as the summary step.
        learning_rate: learning rate supplied by the caller.

    Returns:
        The same ``learning_rate`` that was passed in.
    """
    current_lr = learning_rate
    tf.summary.scalar('learning rate', data=current_lr, step=epoch)
    return current_lr

In [10]:
import datetime

def get_callbacks(NAME, weight_option, fold_num):
    """Build the Keras callbacks used to train one fold.

    All output locations are anchored at ``CURRENT_PATH`` so that TensorBoard
    logs and model checkpoints land next to each other regardless of the
    process working directory at the time training runs.

    Args:
        NAME: model name, used as a prefix in log and checkpoint names.
        weight_option: weight-initialisation option, embedded in the names.
        fold_num: fold identifier, embedded in the names.

    Returns:
        A 5-tuple ``(model_checkpoint_callback, early_stop_callback,
        reduce_lr_callback, lr_logging_callback, tensorboard_callback)``.
    """
    # Shared "<model>_<weights>_FOLD_<n>" tag for every artifact of this fold.
    run_tag = f"{NAME}_{weight_option}_FOLD_{fold_num}"

    # A timestamp suffix gives every run of the same fold its own log directory.
    timestamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    log_dir = f"{CURRENT_PATH}/logs/{EXPERIMENT_NAME}/{run_tag}_{timestamp}"
    tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

    # Anchored at CURRENT_PATH (previously relative to the process cwd) so the
    # checkpoint location is consistent with the log directory above.
    checkpoint_path = f"{CURRENT_PATH}/results/models/{EXPERIMENT_NAME}/{run_tag}.h5"
    model_checkpoint_callback = ModelCheckpoint(checkpoint_path, monitor='val_loss', mode='min', save_best_only=True)

    # Stop after 20 epochs without val_loss improvement; halve the LR after 3.
    early_stop_callback = EarlyStopping(monitor='val_loss', mode="min", patience=20, verbose=1) # Default: patience=20
    reduce_lr_callback = ReduceLROnPlateau(monitor='val_loss', mode="min", factor=0.5, patience=3, verbose=1)

    # lr_schedule returns the LR unchanged; it is attached only to log the value.
    lr_logging_callback = LearningRateScheduler(lr_schedule)

    return model_checkpoint_callback, early_stop_callback, reduce_lr_callback, lr_logging_callback, tensorboard_callback

In [11]:
# Constant variables
NAME = "EfficientNetB0"  # model name; used in log, checkpoint and history names
EPOCHS = 100  # value passed as `epochs` to model.fit
weight_option = None # use `imagenet` or `None` only

In [12]:
import pprint
import numpy as np

def _collect_labels(dataset):
    """Stack the label part of every ``(x, y)`` element of ``dataset`` row-wise.

    An empty dataset yields a ``(0, len(LABELS))`` integer array instead of
    letting ``np.vstack`` fail on an empty list, so callers can still report
    zero counts for that split.
    """
    label_batches = [y for _, y in dataset]
    if not label_batches:
        return np.zeros((0, len(LABELS)), dtype=int)
    return np.vstack(label_batches)


def check_label_on_dataset(train_dataset, test_dataset):
    """Print per-label sums and row counts for two dataset splits.

    Both datasets are fully iterated once. For each split the stacked labels
    are summed column-wise and paired with ``LABELS`` for display, followed by
    the number of rows; finally the combined row count is printed.

    Args:
        train_dataset: iterable of ``(x, y)`` pairs; ``y`` is assumed to be a
            2-D batch of label rows with one column per entry of ``LABELS``
            (TODO confirm against ``get_dataset``).
        test_dataset: second iterable of the same form (the caller may pass a
            validation split here).

    Returns:
        None. Results are printed only.
    """
    train_labels = _collect_labels(train_dataset)
    test_labels = _collect_labels(test_dataset)

    # Print
    pprint.pprint(list(zip(LABELS, train_labels.sum(axis=0))))
    print("Count:", len(train_labels))
    pprint.pprint(list(zip(LABELS, test_labels.sum(axis=0))))
    print("Count: ", len(test_labels))

    print("All:", len(train_labels) + len(test_labels))

In [13]:
from modules.models import Model_under_sampling_dropout

## TODO: Start Debugging Here

In [14]:
# TODO: Main
import time
from tqdm.notebook import tqdm

NUM_FOLDS = 5
# NOTE: the range starts at 4, so this cell trains folds 4..NUM_FOLDS only.
for fold_num in tqdm(range(4, NUM_FOLDS + 1)):

    # A new model is built on every iteration; clear Keras' global state first so
    # state left over from the previous fold's model does not keep accumulating.
    tf.keras.backend.clear_session()

    # Callbacks
    (
        model_checkpoint_callback,
        early_stop_callback,
        reduce_lr_callback,
        lr_logging_callback,
        tensorboard_callback
    ) = get_callbacks(
        NAME, weight_option, fold_num
    )

    # Path for CSV
    path = os.path.join(CURRENT_PATH, "results", "history", EXPERIMENT_NAME, f"{NAME}_{weight_option}_FOLD_{fold_num}")
    os.makedirs(path, exist_ok=True)

    # CSV Logger: per-epoch metrics are appended to history.csv in the fold's folder
    csv_logger = CSVLogger(os.path.join(path, "history.csv"))

    # Dataset: load this fold's splits and print their label distribution.
    # check_label_on_dataset iterates both splits in full before training starts.
    dataset = Dataset(fold_num)
    train_dataset, validation_dataset = dataset.get_train(), dataset.get_valid()
    check_label_on_dataset(train_dataset, validation_dataset)

    # Modeling: EfficientNetB0 backbone without its classification head,
    # wrapped by the project's Model_under_sampling_dropout.
    transfer_model = tf.keras.applications.efficientnet.EfficientNetB0(
        include_top=False,
        weights=weight_option,
        input_shape=(224, 224, 3),
        pooling=None
    )

    model = Model_under_sampling_dropout(
        transfer_model,
    )
    model = model.get_model()
    model._name = f"{EXPERIMENT_NAME}_FOLD_{fold_num}"
    model.summary()

    # Record time for training
    start = time.time()

    # Train. tensorboard_callback is now passed to fit: it was previously built
    # but never used, so no TensorBoard logs were written and the
    # tf.summary.scalar call in lr_schedule had no default writer to record into.
    # It is listed last so it runs after the other callbacks each epoch.
    history = model.fit(
        train_dataset,
        epochs=EPOCHS,
        validation_data=validation_dataset,
        verbose=1, # Show progress bar while training
        callbacks=[
            model_checkpoint_callback,
            csv_logger,
            early_stop_callback,
            reduce_lr_callback,
            lr_logging_callback,
            tensorboard_callback,
        ]
    )

    end = time.time()
    print(
        "===== " * 5,
        "Model Traninig for: {:.2f} second(s)".format(end - start),
        "===== " * 5,
        "\n\n"
    )

  0%|          | 0/2 [00:00<?, ?it/s]

2023-04-09 14:02:02.421550: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-09 14:02:03.159435: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 8011 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-40GB MIG 2g.10gb, pci bus id: 0000:31:00.0, compute capability: 8.0


[('No Finding', 6063),
 ('Atelectasis', 7476),
 ('Consolidation', 2996),
 ('Infiltration', 12793),
 ('Pneumothorax', 3409),
 ('Edema', 1491),
 ('Emphysema', 1626),
 ('Fibrosis', 1087),
 ('Effusion', 8543),
 ('Pneumonia', 917),
 ('Pleural_Thickening', 2161),
 ('Cardiomegaly', 1769),
 ('Nodule', 4068),
 ('Mass', 3678),
 ('Hernia', 142)]
Count: 39244
[('No Finding', 1497),
 ('Atelectasis', 1858),
 ('Consolidation', 777),
 ('Infiltration', 3150),
 ('Pneumothorax', 847),
 ('Edema', 366),
 ('Emphysema', 403),
 ('Fibrosis', 280),
 ('Effusion', 2149),
 ('Pneumonia', 220),
 ('Pleural_Thickening', 561),
 ('Cardiomegaly', 450),
 ('Nodule', 978),
 ('Mass', 963),
 ('Hernia', 39)]
Count:  9811
All: 49055
Model: "under_sampling_5_folds_with_cross_entropy_loss_with_model_under_sampling_dropout_debug_FOLD_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 efficientnetb0 (Functional)  (None, 7, 7, 1280)       4049571   

2023-04-09 14:02:33.228455: I tensorflow/stream_executor/cuda/cuda_dnn.cc:384] Loaded cuDNN version 8100
2023-04-09 14:02:34.178873: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-09 14:02:34.179953: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-09 14:02:34.180008: W tensorflow/stream_executor/gpu/asm_compiler.cc:80] Couldn't get ptxas version string: INTERNAL: Couldn't invoke ptxas --version
2023-04-09 14:02:34.181097: I tensorflow/core/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2023-04-09 14:02:34.181219: W tensorflow/stream_executor/gpu/redzone_allocator.cc:314] INTERNAL: Failed to launch ptxas
Relying on driver to perform ptx compilation. 
Modify $PATH to customize ptxas location.
This message will be only logged once.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 10: ReduceLROnPlateau reducing learning rate to 0.0002500000118743628.
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0001250000059371814.
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 16: ReduceLROnPlateau reducing learning rate to 6.25000029685907e-05.
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 19: ReduceLROnPlateau reducing learning rate to 3.125000148429535e-05.
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 22: ReduceLROnPlateau reducing learning rate to 1.5625000742147677e-05.
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 25: ReduceLROnPlateau reducing learning rate to 7.812500371073838e-06.
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 28: ReduceLROnPlateau reducing learning rate to 3.906250185536919e-06.
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epo

In [15]:
# Completion marker: printed once the cells above have finished running.
print("Done")

Done
