In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tqdm
import os
import glob
import cv2

## Dataloader

In [2]:
# Root of the ChestX-ray14 copy on the mounted Drive. Keeping it in one constant
# means the dataset can be relocated without editing every path.
DATA_DIR = "/content/drive/My Drive/chestxray14"

# `meta`: table of the images in the split; `data_entry`: the dataset-wide
# Data_Entry_2017 table.
# NOTE(review): this reads the *train* split file, while the TFRecords further
# down are written to a folder named "val_ap" - confirm the intended split.
meta = pd.read_csv("{}/original_split/original_split_train.csv".format(DATA_DIR))
data_entry = pd.read_csv("{}/Data_Entry_2017.csv".format(DATA_DIR))

In [3]:
# Number of rows in the split table as loaded from the CSV.
meta.shape[0]

78484

In [None]:
# Peek at the first five rows of the split table.
meta.head(n=5)

Unnamed: 0,Image Index,Atelectasis,Cardiomegaly,Effusion,Infiltration,Mass,Nodule,Pneumonia,Pneumothorax,Consolidation,Edema,Emphysema,Fibrosis,Pleural_Thickening,Hernia,path,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000003_000.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,/content/drive/My Drive/chestxray14/images512/...,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,
1,00000003_001.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,/content/drive/My Drive/chestxray14/images512/...,Hernia,1,3,74,F,PA,2500,2048,0.168,0.168,
2,00000003_002.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,/content/drive/My Drive/chestxray14/images512/...,Hernia,2,3,75,F,PA,2048,2500,0.168,0.168,
3,00000003_003.png,0,0,0,1,0,0,0,0,0,0,0,0,0,1,/content/drive/My Drive/chestxray14/images512/...,Hernia|Infiltration,3,3,76,F,PA,2698,2991,0.143,0.143,
4,00000003_004.png,0,0,0,0,0,0,0,0,0,0,0,0,0,1,/content/drive/My Drive/chestxray14/images512/...,Hernia,4,3,77,F,PA,2500,2048,0.168,0.168,


In [None]:
# Peek at the first five rows of the Data_Entry_2017 table.
data_entry.head(n=5)

Unnamed: 0,Image Index,Finding Labels,Follow-up #,Patient ID,Patient Age,Patient Gender,View Position,OriginalImage[Width,Height],OriginalImagePixelSpacing[x,y],Unnamed: 11
0,00000001_000.png,Cardiomegaly,0,1,58,M,PA,2682,2749,0.143,0.143,
1,00000001_001.png,Cardiomegaly|Emphysema,1,1,58,M,PA,2894,2729,0.143,0.143,
2,00000001_002.png,Cardiomegaly|Effusion,2,1,58,M,PA,2500,2048,0.168,0.168,
3,00000002_000.png,No Finding,0,2,81,M,PA,2500,2048,0.171,0.171,
4,00000003_000.png,Hernia,0,3,81,F,PA,2582,2991,0.143,0.143,


In [None]:
def generate_path(row, base_dir='/content/drive/My Drive/chestxray14/images512'):
  """Build the image file path for one metadata row.

  Args:
    row: Mapping-like record (for example a DataFrame row) whose
      'Image Index' entry holds the image file name.
    base_dir: Directory that contains the image files. Defaults to the
      location this notebook originally hard-coded, so existing callers that
      pass only `row` get the same result as before.

  Returns:
    The string '<base_dir>/<Image Index>'.
  """
  # Strip a trailing slash so a caller-supplied 'dir/' does not yield 'dir//name'.
  return '{}/{}'.format(base_dir.rstrip('/'), row['Image Index'])

In [None]:
# Attach the location of every image as a 'path' column (one call per row).
meta = meta.assign(path=meta.apply(generate_path, axis=1))

In [None]:
# Join the Data_Entry_2017 columns onto the split table, matching rows by
# image file name (inner join, which is the pandas default).
meta = meta.merge(data_entry, how='inner', on='Image Index')

In [None]:
# Keep only the rows whose 'View Position' is 'AP'.
meta = meta.loc[meta['View Position'].eq('AP')]

In [None]:
# The 14 finding columns, in the order they appear in the split table.
# Selecting by name instead of by position (columns 1..14) keeps the label
# matrix correct if a column is ever inserted or reordered upstream, and a
# missing column raises a KeyError instead of silently shifting the labels.
LABEL_COLUMNS = [
    'Atelectasis', 'Cardiomegaly', 'Effusion', 'Infiltration', 'Mass',
    'Nodule', 'Pneumonia', 'Pneumothorax', 'Consolidation', 'Edema',
    'Emphysema', 'Fibrosis', 'Pleural_Thickening', 'Hernia',
]
meta_labels = meta[LABEL_COLUMNS]

In [None]:
# Each dataset element is a triple: (image path, label row, image file name).
_ds_components = (
    meta['path'],
    np.array(meta_labels),
    meta['Image Index'],
)
data_ds = tf.data.Dataset.from_tensor_slices(_ds_components)

In [None]:
def decode_img(img):
  """Decode PNG bytes into a 3-channel float32 image resized to 224x224.

  Args:
    img: Encoded PNG file contents.

  Returns:
    The image after PNG decoding (forced to 3 channels), conversion to
    float32 and resizing to (224, 224).
  """
  decoded = tf.image.decode_png(img, channels=3)
  as_float = tf.image.convert_image_dtype(decoded, tf.float32)
  resized = tf.image.resize(as_float, (224, 224))
  return resized

def process_path(path, label, sid):
  """Read and decode the image at `path`; `label` and `sid` pass through unchanged.

  Returns:
    Tuple `(image, label, sid)` where `image` is the output of `decode_img`.
  """
  raw_bytes = tf.io.read_file(path)
  return decode_img(raw_bytes), label, sid

In [None]:
# Decode images in parallel, drop elements whose processing raised an error,
# then cache the decoded elements under the '/content/train_cache' file prefix.
data_im = data_ds.map(
    process_path,
    num_parallel_calls=tf.data.experimental.AUTOTUNE,
)
data_im = data_im.apply(tf.data.experimental.ignore_errors())
data_im = data_im.cache('/content/train_cache')

### Create TFRecord

In [None]:
# Number of rows left in `meta` at this point of the notebook.
meta.shape[0]

2824

In [None]:
# Number of TFRecord files the data is split into.
TR_SHARDS = 3

# Destination folder for the shards. The trailing slash matters: file names are
# appended to this string by plain concatenation.
GCS_OUTPUT = (
    '/content/drive/Shared drives/CMB - corpora/'
    'Chest_x-ray_report_Jan2020/'
    'Chest Xray14 Tfrecord/val_ap/'
)

In [None]:
import math

# Records per shard: the row count spread over TR_SHARDS files, rounded up so
# that every row lands in some shard.
data_size = meta.shape[0]
shard_size = math.ceil(data_size / TR_SHARDS)

In [None]:
# Show the computed number of records per shard.
shard_size

942

In [None]:
def recompress_image(image, label, sid):
  """Re-encode a float image as JPEG bytes so it can be stored in a TFRecord.

  Args:
    image: Float image tensor; values are expected in [0, 1], as produced by
      `tf.image.convert_image_dtype(..., tf.float32)`.
    label: Passed through unchanged.
    sid: Passed through unchanged.

  Returns:
    Tuple `(jpeg_bytes, label, sid)`.
  """
  # `image * 255` followed by a plain cast truncates toward zero, so a value
  # stored as k/255 can land just below k and come out as k-1, and every pixel
  # is biased downward. convert_image_dtype performs the float->uint8 scaling
  # with rounding protection, and saturate=True clamps out-of-range values
  # instead of letting them wrap around.
  image = tf.image.convert_image_dtype(image, tf.uint8, saturate=True)
  image = tf.image.encode_jpeg(image, optimize_size=True, chroma_downsampling=False)
  return image, label, sid

In [None]:
# JPEG-encode every element, then batch so that one batch holds the contents
# of one shard (batch size == shard_size).
data_imc = (
    data_im
    .map(recompress_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(shard_size)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
# Three types of data can be stored in TFRecords: bytestrings, integers and floats
# They are always stored as lists, a single data element will be a list of size 1

def _bytestring_feature(list_of_bytestrings):
  """Wrap a list of byte strings in a `tf.train.Feature` (bytes_list)."""
  payload = tf.train.BytesList(value=list_of_bytestrings)
  return tf.train.Feature(bytes_list=payload)

def _int_feature(list_of_ints):
  """Wrap a list of integers in a `tf.train.Feature` (int64_list)."""
  payload = tf.train.Int64List(value=list_of_ints)
  return tf.train.Feature(int64_list=payload)

def _float_feature(list_of_floats):
  """Wrap a list of floats in a `tf.train.Feature` (float_list)."""
  payload = tf.train.FloatList(value=list_of_floats)
  return tf.train.Feature(float_list=payload)
  

def to_tfrecord(tfrec_filewriter, img, label, sid):
  """Build the `tf.train.Example` for a single sample.

  Args:
    tfrec_filewriter: Not used by this function; the caller writes the
      returned example itself.
    img: Encoded image bytes, stored under "image".
    label: Array with a `reshape` method; flattened and stored as floats
      under "label".
    sid: Byte-string identifier, stored under "image_index".

  Returns:
    A `tf.train.Example` with the features "image", "label" and "image_index".
  """
  flat_label = label.reshape(-1)
  features = tf.train.Features(feature={
      "image": _bytestring_feature([img]),
      "label": _float_feature(flat_label),
      "image_index": _bytestring_feature([sid]),
  })
  return tf.train.Example(features=features)


In [None]:
print("Writing TFRecords")
# Make sure the destination exists; TFRecordWriter does not create directories.
os.makedirs(GCS_OUTPUT, exist_ok=True)

for shard, (image, label, sid) in tqdm.tqdm(enumerate(data_imc)):
  # One batch == one shard. Use a dedicated name for the record count: reusing
  # `shard_size` here would overwrite the global that sets the batch size of
  # `data_imc`, so re-running cells would silently change the sharding.
  n_records = int(image.shape[0])
  # good practice to have the number of records in the filename
  filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, n_records)
  if os.path.exists(filename):
    # Shard already written by a previous run; skip it.
    continue

  # Convert each batched tensor to NumPy once per shard instead of once per
  # record (calling .numpy() inside the inner loop re-materializes the batch).
  images_np = image.numpy()
  labels_np = label.numpy()
  sids_np = sid.numpy()

  with tf.io.TFRecordWriter(filename) as out_file:
    for i in tqdm.tqdm(range(n_records)):
      example = to_tfrecord(out_file, images_np[i], labels_np[i], sids_np[i])
      out_file.write(example.SerializeToString())
  # Report only after the writer has been closed.
  print("Wrote file {} containing {} records".format(filename, n_records))

0it [00:00, ?it/s]

Writing TFRecords



  0%|          | 0/942 [00:00<?, ?it/s][A
  1%|          | 10/942 [00:00<00:09, 93.43it/s][A
  2%|▏         | 20/942 [00:00<00:10, 91.63it/s][A
  3%|▎         | 29/942 [00:00<00:10, 89.26it/s][A
  4%|▍         | 38/942 [00:00<00:10, 85.54it/s][A
  5%|▌         | 48/942 [00:00<00:10, 87.53it/s][A
  6%|▌         | 58/942 [00:00<00:09, 90.31it/s][A
  7%|▋         | 68/942 [00:00<00:09, 92.50it/s][A
  8%|▊         | 78/942 [00:00<00:09, 93.53it/s][A
  9%|▉         | 87/942 [00:00<00:09, 92.00it/s][A
 10%|█         | 97/942 [00:01<00:09, 93.47it/s][A
 11%|█▏        | 108/942 [00:01<00:08, 95.27it/s][A
 13%|█▎        | 118/942 [00:01<00:09, 91.12it/s][A
 14%|█▎        | 128/942 [00:01<00:08, 93.38it/s][A
 15%|█▍        | 138/942 [00:01<00:08, 91.12it/s][A
 16%|█▌        | 148/942 [00:01<00:08, 90.26it/s][A
 17%|█▋        | 159/942 [00:01<00:08, 93.91it/s][A
 18%|█▊        | 170/942 [00:01<00:07, 96.75it/s][A
 19%|█▉        | 180/942 [00:01<00:07, 95.52it/s][A
 20%|██     

Wrote file /content/drive/Shared drives/CMB - corpora/Chest_x-ray_report_Jan2020/Chest Xray14 Tfrecord/val_ap/00-942.tfrec containing 942 records



  0%|          | 0/942 [00:00<?, ?it/s][A
  1%|          | 11/942 [00:00<00:08, 107.88it/s][A
  2%|▏         | 22/942 [00:00<00:08, 106.36it/s][A
  4%|▎         | 33/942 [00:00<00:08, 103.88it/s][A
  4%|▍         | 42/942 [00:00<00:09, 98.71it/s] [A
  5%|▌         | 51/942 [00:00<00:09, 95.04it/s][A
  6%|▋         | 61/942 [00:00<00:09, 96.40it/s][A
  8%|▊         | 72/942 [00:00<00:08, 97.86it/s][A
  9%|▊         | 82/942 [00:00<00:08, 96.15it/s][A
 10%|▉         | 92/942 [00:00<00:09, 93.55it/s][A
 11%|█         | 103/942 [00:01<00:08, 96.01it/s][A
 12%|█▏        | 114/942 [00:01<00:08, 98.14it/s][A
 13%|█▎        | 124/942 [00:01<00:08, 94.63it/s][A
 14%|█▍        | 134/942 [00:01<00:09, 85.27it/s][A
 15%|█▌        | 145/942 [00:01<00:08, 89.56it/s][A
 17%|█▋        | 156/942 [00:01<00:08, 93.00it/s][A
 18%|█▊        | 166/942 [00:01<00:08, 92.18it/s][A
 19%|█▊        | 176/942 [00:01<00:08, 89.52it/s][A
 20%|█▉        | 186/942 [00:01<00:08, 91.92it/s][A
 21%|██

Wrote file /content/drive/Shared drives/CMB - corpora/Chest_x-ray_report_Jan2020/Chest Xray14 Tfrecord/val_ap/01-942.tfrec containing 942 records



  0%|          | 0/910 [00:00<?, ?it/s][A
  1%|          | 9/910 [00:00<00:10, 84.44it/s][A
  2%|▏         | 19/910 [00:00<00:10, 86.49it/s][A
  3%|▎         | 28/910 [00:00<00:10, 87.38it/s][A
  4%|▍         | 37/910 [00:00<00:10, 85.77it/s][A
  5%|▍         | 45/910 [00:00<00:10, 82.32it/s][A
  6%|▌         | 52/910 [00:00<00:11, 77.89it/s][A
  7%|▋         | 60/910 [00:00<00:11, 77.00it/s][A
  7%|▋         | 68/910 [00:00<00:11, 75.86it/s][A
  8%|▊         | 76/910 [00:00<00:11, 75.01it/s][A
  9%|▉         | 84/910 [00:01<00:11, 74.73it/s][A
 10%|█         | 92/910 [00:01<00:10, 74.97it/s][A
 11%|█         | 101/910 [00:01<00:10, 76.79it/s][A
 12%|█▏        | 110/910 [00:01<00:10, 78.03it/s][A
 13%|█▎        | 119/910 [00:01<00:10, 78.85it/s][A
 14%|█▍        | 127/910 [00:01<00:09, 78.84it/s][A
 15%|█▍        | 135/910 [00:01<00:10, 77.50it/s][A
 16%|█▌        | 143/910 [00:01<00:09, 77.90it/s][A
 17%|█▋        | 151/910 [00:01<00:09, 77.66it/s][A
 17%|█▋       

Wrote file /content/drive/Shared drives/CMB - corpora/Chest_x-ray_report_Jan2020/Chest Xray14 Tfrecord/val_ap/02-910.tfrec containing 910 records


3it [43:11, 863.77s/it] 


In [None]:
def read_tfrecord(example, image_size=(224, 224)):
    """Parse one serialized example written by `to_tfrecord` into (image, label).

    Args:
        example: Scalar string tensor holding a serialized `tf.train.Example`.
        image_size: `(height, width)` of the stored images. The records carry
            no "size" feature, so the shape is supplied here; the default
            matches the 224x224 resize applied when the images were decoded.

    Returns:
        Tuple `(image, label)`: `image` is a uint8 tensor of shape
        `(height, width, 3)`, `label` is a float32 tensor reshaped to `[-1, 14]`.
    """
    features = {
        "image": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "label": tf.io.VarLenFeature(tf.float32),       # a variable-length list of floats
    }
    # decode the TFRecord
    example = tf.io.parse_single_example(example, features)

    # FixedLenFeature fields are ready to use as-is;
    # VarLenFeature fields come back sparse and need sparse.to_dense.
    image = tf.image.decode_jpeg(example['image'], channels=3)
    label = tf.reshape(tf.sparse.to_dense(example['label']), [-1, 14])
    # Pin the static shape so downstream batching knows the image dimensions.
    height, width = image_size
    image = tf.reshape(image, [height, width, 3])
    return image, label

In [None]:
# Read the shards back. Several TFRecord files are read in parallel and
# deterministic ordering is switched off so tf.data may reorder elements
# for throughput.
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

filenames = tf.io.gfile.glob(GCS_OUTPUT + "*.tfrec")
train_dsr = (
    tf.data.TFRecordDataset(filenames, num_parallel_reads=tf.data.experimental.AUTOTUNE)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(300)
)