In [None]:
%cd ~/ChestXray-14/

## Start

In [None]:
!pwd

In [2]:
# from google.colab import drive
# drive.mount('/content/drive')

ModuleNotFoundError: No module named 'google.colab'

In [4]:
!ls "/datasets/chonsawat-drive/KKU /Project/Dataset/ChestXray NIH/models"

ls: cannot access '/datasets/chonsawat-drive/KKU /Project/Dataset/ChestXray NIH/models': No such file or directory


In [7]:
"""
Chonsawat Path: input_path = "/content/drive/MyDrive/KKU /Project/Dataset/ChestXray NIH"
Deepnote Path: input_path = "/datasets/chonsawat-drive/KKU /Project/Dataset/ChestXray NIH"
Elab Path: input_path = "~/ChestXray-14/dataset/ChestXray NIH"
"""
input_path = "dataset/ChestXray NIH"

In [8]:
from sklearn.utils import shuffle
from tqdm.notebook import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
import os

In [28]:
STRATEGY = tf.distribute.get_strategy()    
BATCH_SIZE = 16
IMG_SIZE = 224
SEED = 42
    
print('Using tensorflow %s' % tf.__version__)

Using tensorflow 2.6.2


## 1. Preprocessing to Dataframe

In [10]:
init_df = pd.read_csv(f'{input_path}/Data_Entry_2017.csv', index_col='Image Index')

diseases = [
    'No Finding',
    'Atelectasis',
    'Consolidation',
    'Infiltration',
    'Pneumothorax',
    'Edema',
    'Emphysema',
    'Fibrosis',
    'Effusion',
    'Pneumonia',
    'Pleural_Thickening',
    'Cardiomegaly',
    'Nodule',
    'Mass',
    'Hernia']


filenames = []

for i in range(12):
    filenames.append(os.listdir(f'{input_path}/images_{i+1 :03d}/images'))


def _parse_findings(raw):
    raw = raw.split('|')
    parsed = np.zeros((len(diseases),), dtype=np.bool)
    for i in range(len(diseases)):
        if diseases[i] in raw:
            parsed[i] = 1
    return parsed


def _find_image_path(image_id):
    for i in range(12):
        if image_id in filenames[i]:
            path = os.path.join(f'{input_path}/images_{i+1 :03d}/images', image_id)
            return path


df = pd.DataFrame(
    columns=diseases, 
    data=[_parse_findings(x) for x in init_df['Finding Labels']],
    index=[_find_image_path(x) for x in tqdm(init_df.index, total=len(init_df))])

df.to_csv(f'{input_path}/preprocessed_data.csv')
display(df.head())

  0%|          | 0/112120 [00:00<?, ?it/s]

Unnamed: 0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
dataset/ChestXray NIH/images_001/images/00000001_000.png,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000001_001.png,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000001_002.png,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000002_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_001/images/00000003_000.png,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True


## 2. Serialization

In [29]:
def _serialize_image(path):
    image = tf.io.read_file(path)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    image = tf.cast(image, tf.uint8)
    return tf.image.encode_jpeg(image).numpy()


def _serialize_sample(image_id, image, proba):
    feature = {
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
        'image_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_id])),
        'No Finding': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[0]])),
        'Atelectasis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[1]])),
        'Consolidation': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[2]])),
        'Infiltration': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[3]])),
        'Pneumothorax': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[4]])),
        'Edema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[5]])),
        'Emphysema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[6]])),
        'Fibrosis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[7]])),
        'Effusion': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[8]])),
        'Pneumonia': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[9]])),
        'Pleural_Thickening': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[10]])),
        'Cardiomegaly': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[11]])),
        'Nodule': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[12]])),
        'Mass': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[13]])),
        'Hernia': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[14]]))}
    sample = tf.train.Example(features=tf.train.Features(feature=feature))
    return sample.SerializeToString()
        
        
def serialize_fold(fold, name):
    samples = []
    
    for index, proba in fold.iterrows():
        samples.append(_serialize_sample(
            index.split('/')[-1].encode(), 
            _serialize_image(index), 
            proba))
    
    with tf.io.TFRecordWriter(name + '.tfrec') as writer:
        [writer.write(x) for x in samples]

In [12]:
df = pd.read_csv(f"{input_path}/preprocessed_data.csv", index_col=0)
df

Unnamed: 0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
dataset/ChestXray NIH/images_001/images/00000001_000.png,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000001_001.png,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000001_002.png,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000002_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_001/images/00000003_000.png,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dataset/ChestXray NIH/images_012/images/00030801_001.png,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False
dataset/ChestXray NIH/images_012/images/00030802_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_012/images/00030803_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_012/images/00030804_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [30]:
try: serialize_list = os.listdir(f"{input_path}/data/") 
except: serialize_list = []

serialize_list[:5]

['000-438.tfrec',
 '001-438.tfrec',
 '002-438.tfrec',
 '003-438.tfrec',
 '004-438.tfrec']

In [36]:
df = shuffle(df, random_state=SEED)
folds = 256

os.mkdir(f'{input_path}/data')

try: serialize_list = os.listdir(f"{input_path}/data/") 
except: serialize_list = []

for i, fold in tqdm(enumerate(np.array_split(df, folds)), total=folds):
    # if f"{i:03d}-{len(fold):03d}.tfrec" not in serialize_list:
    #     serialize_fold(fold, name=f'{input_path}/data/{i:03d}-{len(fold):03d}')
    serialize_fold(fold, name=f'{input_path}/data/{i:03d}-{len(fold):03d}')

  0%|          | 0/256 [00:00<?, ?it/s]

  'No Finding': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[0]])),
  'Atelectasis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[1]])),
  'Consolidation': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[2]])),
  'Infiltration': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[3]])),
  'Pneumothorax': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[4]])),
  'Edema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[5]])),
  'Emphysema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[6]])),
  'Fibrosis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[7]])),
  'Effusion': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[8]])),
  'Pneumonia': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[9]])),
  'Pleural_Thickening': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[10]])),
  'Cardiomegaly': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[11]

## 3. Train placeholder

In [1]:
import tensorflow as tf

In [2]:
feature_map = {
    'image': tf.io.FixedLenFeature([], tf.string),
    'image_id': tf.io.FixedLenFeature([], tf.string),
    'No Finding': tf.io.FixedLenFeature([], tf.int64),
    'Atelectasis': tf.io.FixedLenFeature([], tf.int64),
    'Consolidation': tf.io.FixedLenFeature([], tf.int64),
    'Infiltration': tf.io.FixedLenFeature([], tf.int64),
    'Pneumothorax': tf.io.FixedLenFeature([], tf.int64),
    'Edema': tf.io.FixedLenFeature([], tf.int64),
    'Emphysema': tf.io.FixedLenFeature([], tf.int64),
    'Fibrosis': tf.io.FixedLenFeature([], tf.int64),
    'Effusion': tf.io.FixedLenFeature([], tf.int64),
    'Pneumonia': tf.io.FixedLenFeature([], tf.int64),
    'Pleural_Thickening': tf.io.FixedLenFeature([], tf.int64),
    'Cardiomegaly': tf.io.FixedLenFeature([], tf.int64),
    'Nodule': tf.io.FixedLenFeature([], tf.int64),
    'Mass': tf.io.FixedLenFeature([], tf.int64),
    'Hernia': tf.io.FixedLenFeature([], tf.int64)}


def count_data_items(filenames):
    return np.sum([int(x[:-6].split('-')[-1]) for x in filenames])


def decode_image(image_data):
    image = tf.image.decode_jpeg(image_data, channels=1)
    image = tf.reshape(image, [IMG_SIZE, IMG_SIZE, 1])
    return image


def scale_image(image, target):
    image = tf.cast(image, tf.float32) / 255.
    return image, target


def read_tfrecord(example):
    example = tf.io.parse_single_example(example, feature_map)
    image = decode_image(example['image'])
    target = [
        example['No Finding'],
        example['Atelectasis'],
        example['Consolidation'],
        example['Infiltration'],
        example['Pneumothorax'],
        example['Edema'],
        example['Emphysema'],
        example['Fibrosis'],
        example['Effusion'],
        example['Pneumonia'],
        example['Pleural_Thickening'],
        example['Cardiomegaly'],
        example['Nodule'],
        example['Mass'],
        example['Hernia']]
    return image, target


def data_augment(image, target):
    image = tf.image.random_flip_left_right(image, seed=SEED)
    image = tf.image.random_flip_up_down(image, seed=SEED)
    return image, target


def get_dataset(filenames, shuffled=False, repeated=False, 
                cached=False, augmented=False, distributed=True):
    auto = tf.data.experimental.AUTOTUNE
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=auto)
    dataset = dataset.map(read_tfrecord, num_parallel_calls=auto)
    if augmented:
        dataset = dataset.map(data_augment, num_parallel_calls=auto)
    dataset = dataset.map(scale_image, num_parallel_calls=auto)
    if shuffled:
        dataset = dataset.shuffle(2048, seed=SEED)
    if repeated:
        dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    if cached:
        dataset = dataset.cache()
    dataset = dataset.prefetch(auto)
    if distributed:
        dataset = STRATEGY.experimental_distribute_dataset(dataset)
    return dataset


def get_model():
    model = tf.keras.models.Sequential([
        tf.keras.applications.EfficientNetB0(
            include_top=False,
            input_shape=(None, None, 1),
            weights=None,
            pooling='avg'),
        tf.keras.layers.Dense(15, activation='sigmoid')
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=tf.keras.metrics.AUC(multi_label=True))

    return model

In [39]:
tf.keras.applications.EfficientNetB0(
            include_top=False,
            input_shape=(None, None, 1),
            weights=None,
            pooling='avg').summary()

Model: "efficientnetb0"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_4 (InputLayer)            [(None, None, None,  0                                            
__________________________________________________________________________________________________
rescaling_3 (Rescaling)         (None, None, None, 1 0           input_4[0][0]                    
__________________________________________________________________________________________________
normalization_3 (Normalization) (None, None, None, 1 3           rescaling_3[0][0]                
__________________________________________________________________________________________________
stem_conv_pad (ZeroPadding2D)   (None, None, None, 1 0           normalization_3[0][0]            
_____________________________________________________________________________________

In [42]:
STRATEGY = tf.distribute.get_strategy()    
BATCH_SIZE = 16
IMG_SIZE = 600
SEED = 42

In [43]:
import tensorflow as tf
filenames = tf.io.gfile.glob(f'{input_path}/data/*.tfrec')

train_filenames = [filenames[0]]
val_filenames = [filenames[1]]

steps_per_epoch = count_data_items(train_filenames) // BATCH_SIZE
validation_steps = count_data_items(val_filenames) // BATCH_SIZE

train_dataset = get_dataset(train_filenames, shuffled=True, repeated=True, augmented=True)
val_dataset = get_dataset(val_filenames, cached=True)

train_dataset

<PrefetchDataset shapes: ((16, 600, 600, 1), (16, 15)), types: (tf.float32, tf.int64)>

In [44]:
filenames = tf.io.gfile.glob(f'{input_path}/data/*.tfrec')

train_filenames = [filenames[0]]
val_filenames = [filenames[1]]

steps_per_epoch = count_data_items(train_filenames) // BATCH_SIZE
validation_steps = count_data_items(val_filenames) // BATCH_SIZE

train_dataset = get_dataset(train_filenames, shuffled=True, repeated=True, augmented=True)
val_dataset = get_dataset(val_filenames, cached=True)

with STRATEGY.scope():
    model = get_model()
    
history = model.fit(
    train_dataset,
    steps_per_epoch=steps_per_epoch,
    epochs=2,
    validation_data=val_dataset,
    validation_steps=validation_steps,
    verbose=1)

Epoch 1/2


2022-03-24 04:16:41.520408: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 2.85GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-03-24 04:16:42.443309: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.68GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-03-24 04:16:42.827327: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.26GiB with freed_by_count=0. The caller indicates that this is not a failure, but may mean that there could be performance gains if more memory were available.
2022-03-24 04:16:43.368663: W tensorflow/core/common_runtime/bfc_allocator.cc:272] Alloc

ResourceExhaustedError: 2 root error(s) found.
  (0) Resource exhausted:  OOM when allocating tensor with shape[16,672,37,37] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node sequential_2/efficientnetb0/block5b_dwconv/depthwise (defined at tmp/ipykernel_75/2410746617.py:15) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

	 [[assert_less_equal/Assert/AssertGuard/pivot_f/_13/_35]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

  (1) Resource exhausted:  OOM when allocating tensor with shape[16,672,37,37] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc
	 [[node sequential_2/efficientnetb0/block5b_dwconv/depthwise (defined at tmp/ipykernel_75/2410746617.py:15) ]]
Hint: If you want to see a list of allocated tensors when OOM happens, add report_tensor_allocations_upon_oom to RunOptions for current allocation info. This isn't available when running in Eager mode.

0 successful operations.
0 derived errors ignored. [Op:__inference_train_function_2303423]

Function call stack:
train_function -> train_function


In [None]:
model.save(f"{input_path}/models/EfficientNetB0.h5")

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=bcf52be9-3e5b-470d-bba7-96b3b47ba74c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>