In [1]:
%cd ~/ChestXray-14/

/home/jovyan/ChestXray-14


## Start

In [2]:
"""
Chonsawat Path: input_path = "/content/drive/MyDrive/KKU /Project/Dataset/ChestXray NIH"
Deepnote Path: input_path = "/datasets/chonsawat-drive/KKU /Project/Dataset/ChestXray NIH"
Elab Path: input_path = "~/ChestXray-14/dataset/ChestXray NIH"
"""
input_path = "dataset/ChestXray NIH"

In [3]:
from sklearn.utils import shuffle
from tqdm.notebook import tqdm
import tensorflow as tf
import pandas as pd
import numpy as np
import os

In [4]:
STRATEGY = tf.distribute.get_strategy()    
BATCH_SIZE = 16
IMG_SIZE = 224
SEED = 42
    
print('Using tensorflow %s' % tf.__version__)

Using tensorflow 2.6.2


## 2. Serialization

In [4]:
def _serialize_image(path):
    image = tf.io.read_file(path)
    image = tf.image.decode_png(image, channels=1)
    image = tf.image.resize(image, [IMG_SIZE, IMG_SIZE])
    image = tf.cast(image, tf.uint8)
    return tf.image.encode_jpeg(image).numpy()


def _serialize_sample(image_id, image, proba):
    feature = {
        'image': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image])),
        'image_id': tf.train.Feature(bytes_list=tf.train.BytesList(value=[image_id])),
        'No Finding': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[0]])),
        'Atelectasis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[1]])),
        'Consolidation': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[2]])),
        'Infiltration': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[3]])),
        'Pneumothorax': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[4]])),
        'Edema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[5]])),
        'Emphysema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[6]])),
        'Fibrosis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[7]])),
        'Effusion': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[8]])),
        'Pneumonia': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[9]])),
        'Pleural_Thickening': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[10]])),
        'Cardiomegaly': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[11]])),
        'Nodule': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[12]])),
        'Mass': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[13]])),
        'Hernia': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[14]]))}
    sample = tf.train.Example(features=tf.train.Features(feature=feature))
    return sample.SerializeToString()
        
        
def serialize_fold(fold, name):
    samples = []
    
    for index, proba in fold.iterrows():
        samples.append(_serialize_sample(
            index.split('/')[-1].encode(), 
            _serialize_image(index), 
            proba))
    
    with tf.io.TFRecordWriter(name + '.tfrec') as writer:
        [writer.write(x) for x in samples]

In [5]:
df = pd.read_csv(f"{input_path}/preprocessed_data.csv", index_col=0)
df

Unnamed: 0,No Finding,Atelectasis,Consolidation,Infiltration,Pneumothorax,Edema,Emphysema,Fibrosis,Effusion,Pneumonia,Pleural_Thickening,Cardiomegaly,Nodule,Mass,Hernia
dataset/ChestXray NIH/images_001/images/00000001_000.png,False,False,False,False,False,False,False,False,False,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000001_001.png,False,False,False,False,False,False,True,False,False,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000001_002.png,False,False,False,False,False,False,False,False,True,False,False,True,False,False,False
dataset/ChestXray NIH/images_001/images/00000002_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_001/images/00000003_000.png,False,False,False,False,False,False,False,False,False,False,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
dataset/ChestXray NIH/images_012/images/00030801_001.png,False,False,False,False,False,False,False,False,False,True,False,False,False,True,False
dataset/ChestXray NIH/images_012/images/00030802_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_012/images/00030803_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False
dataset/ChestXray NIH/images_012/images/00030804_000.png,True,False,False,False,False,False,False,False,False,False,False,False,False,False,False


In [7]:
SEED = 42
df = shuffle(df, random_state=SEED)
df_train = df[:78484] # 70%
df_valid = df[78484:100908] # 20%
df_test = df[100908:] # 10%

### อ่าน Files ใน Data
สร้างโฟลเดอร์
```python
os.mkdir(f'{input_path}/data')
```

ลองอ่านไฟล์จากโฟลเดอร์
```python
try: serialize_list = os.listdir(f"{input_path}/data/224x224/train") 
except: serialize_list = []
```

In [10]:
IMG_SIZE = 224
folds = 256

tfrec_path = f'{input_path}/data/224x224/train'
for i, fold in tqdm(enumerate(np.array_split(df_train, folds)), total=folds):
    serialize_fold(fold, name=f'{tfrec_path}/{i:03d}-{len(fold):03d}')
    
tfrec_path = f'{input_path}/data/224x224/valid'
for i, fold in tqdm(enumerate(np.array_split(df_valid, folds)), total=folds):
    serialize_fold(fold, name=f'{tfrec_path}/{i:03d}-{len(fold):03d}')

tfrec_path = f'{input_path}/data/224x224/test'
for i, fold in tqdm(enumerate(np.array_split(df_test, folds)), total=folds):
    serialize_fold(fold, name=f'{tfrec_path}/{i:03d}-{len(fold):03d}')

  0%|          | 0/256 [00:00<?, ?it/s]

  'No Finding': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[0]])),
  'Atelectasis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[1]])),
  'Consolidation': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[2]])),
  'Infiltration': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[3]])),
  'Pneumothorax': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[4]])),
  'Edema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[5]])),
  'Emphysema': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[6]])),
  'Fibrosis': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[7]])),
  'Effusion': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[8]])),
  'Pneumonia': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[9]])),
  'Pleural_Thickening': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[10]])),
  'Cardiomegaly': tf.train.Feature(int64_list=tf.train.Int64List(value=[proba[11]

  0%|          | 0/256 [00:00<?, ?it/s]

  0%|          | 0/256 [00:00<?, ?it/s]



In [11]:
import tensorflow as tf

In [12]:
feature_map = {
    'image': tf.io.FixedLenFeature([], tf.string),
    'image_id': tf.io.FixedLenFeature([], tf.string),
    'No Finding': tf.io.FixedLenFeature([], tf.int64),
    'Atelectasis': tf.io.FixedLenFeature([], tf.int64),
    'Consolidation': tf.io.FixedLenFeature([], tf.int64),
    'Infiltration': tf.io.FixedLenFeature([], tf.int64),
    'Pneumothorax': tf.io.FixedLenFeature([], tf.int64),
    'Edema': tf.io.FixedLenFeature([], tf.int64),
    'Emphysema': tf.io.FixedLenFeature([], tf.int64),
    'Fibrosis': tf.io.FixedLenFeature([], tf.int64),
    'Effusion': tf.io.FixedLenFeature([], tf.int64),
    'Pneumonia': tf.io.FixedLenFeature([], tf.int64),
    'Pleural_Thickening': tf.io.FixedLenFeature([], tf.int64),
    'Cardiomegaly': tf.io.FixedLenFeature([], tf.int64),
    'Nodule': tf.io.FixedLenFeature([], tf.int64),
    'Mass': tf.io.FixedLenFeature([], tf.int64),
    'Hernia': tf.io.FixedLenFeature([], tf.int64)}


def count_data_items(filenames):
    return np.sum([int(x[:-6].split('-')[-1]) for x in filenames])


def decode_image(image_data):
    image = tf.image.decode_jpeg(image_data, channels=1)
    image = tf.reshape(image, [IMG_SIZE, IMG_SIZE, 1])
    return image


def scale_image(image, target):
    image = tf.cast(image, tf.float32) / 255.
    return image, target


def read_tfrecord(example):
    example = tf.io.parse_single_example(example, feature_map)
    image = decode_image(example['image'])
    target = [
        example['No Finding'],
        example['Atelectasis'],
        example['Consolidation'],
        example['Infiltration'],
        example['Pneumothorax'],
        example['Edema'],
        example['Emphysema'],
        example['Fibrosis'],
        example['Effusion'],
        example['Pneumonia'],
        example['Pleural_Thickening'],
        example['Cardiomegaly'],
        example['Nodule'],
        example['Mass'],
        example['Hernia']]
    return image, target


def data_augment(image, target):
    image = tf.image.random_flip_left_right(image, seed=SEED)
    image = tf.image.random_flip_up_down(image, seed=SEED)
    return image, target


def get_dataset(filenames, shuffled=False, repeated=False, 
                cached=False, augmented=False, distributed=True):
    auto = tf.data.experimental.AUTOTUNE
    dataset = tf.data.TFRecordDataset(filenames, num_parallel_reads=auto)
    dataset = dataset.map(read_tfrecord, num_parallel_calls=auto)
    if augmented:
        dataset = dataset.map(data_augment, num_parallel_calls=auto)
    dataset = dataset.map(scale_image, num_parallel_calls=auto)
    if shuffled:
        dataset = dataset.shuffle(2048, seed=SEED)
    if repeated:
        dataset = dataset.repeat()
    dataset = dataset.batch(BATCH_SIZE, drop_remainder=True)
    if cached:
        dataset = dataset.cache()
    dataset = dataset.prefetch(auto)
    if distributed:
        dataset = STRATEGY.experimental_distribute_dataset(dataset)
    return dataset


def get_model():
    model = tf.keras.models.Sequential([
        tf.keras.applications.EfficientNetB0(
            include_top=False,
            input_shape=(None, None, 1),
            weights=None,
            pooling='avg'),
        tf.keras.layers.Dense(15, activation='sigmoid')
    ])
    model.compile(
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=tf.keras.metrics.AUC(multi_label=True))

    return model

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=bcf52be9-3e5b-470d-bba7-96b3b47ba74c' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>