In [1]:
import pandas as pd
import numpy as np
import tqdm
import os
import glob
import cv2

## Dataloader

### View Matching

In [2]:
import tensorflow as tf

In [13]:
# Load the per-image metadata/label table.
# NOTE(review): the frame is called `train` but is read from test_set_both.csv —
# confirm which split this notebook is meant to process.
train = pd.read_csv("/content/drive/My Drive/AG-CNN/dataset/test_set_both.csv")

In [None]:
# Number of rows whose ViewPosition is 'LL'.
# NOTE(review): the recorded output (5403) differs from the LL count (24302) in
# the value_counts() output further down — one of the two outputs is stale.
len(train[train['ViewPosition'] == 'LL'])

5403

In [14]:
def generate_path(row):
  """Build the PNG path of one image from its metadata row.

  Args:
    row: mapping/Series providing `subject_id`, `study_id` and `dicom_id`.
      The two ids are passed through `int()` before formatting, so float
      values such as 123.0 are rendered without a decimal part.

  Returns:
    The path `<root>/p<first two digits of subject>/p<subject>/s<study>/<dicom>.png`.
  """
  subject = str(int(row['subject_id']))
  study = str(int(row['study_id']))
  # Images are bucketed by the first two digits of the subject id.
  parts = (subject[:2], subject, study, row['dicom_id'])
  return '/content/drive/My Drive/mimic-cxr/images512/p{}/p{}/s{}/{}.png'.format(*parts)

In [15]:
# Attach the image path of every row (row-wise apply).
train['path'] = train.apply(generate_path, axis=1)

In [16]:
# Group the image rows by study so that the views of one study can be paired.
train_g = train.groupby('study_id')

In [17]:
import random 

def aggregate_images(df):
  """Pair every frontal image of one study with every lateral image of it.

  Args:
    df: the rows of a single study. Must provide the `ViewPosition`, `path`
      and `study_id` columns.

  Returns:
    A DataFrame with one row per (frontal, lateral) pair and the columns
    `study_id`, `frontal`, `side`, `view` (one of `ap_ll`, `ap_la`, `pa_ll`,
    `pa_la`) and `select`, which is 1 on exactly one randomly chosen pair and
    0 on the others. Returns None when the study has fewer than two rows, no
    AP/PA image, or no LL/LATERAL image.
  """
  if len(df) < 2:
    return None

  def paths_for(view_position):
    return df.loc[df['ViewPosition'] == view_position, 'path'].values

  frontal_views = (('ap', paths_for('AP')), ('pa', paths_for('PA')))
  side_views = (('ll', paths_for('LL')), ('la', paths_for('LATERAL')))

  # Row order: all AP images first, then all PA images; for each frontal
  # image its LL partners come before its LATERAL partners.
  pairs = [
      (frontal_path, side_path, '{}_{}'.format(frontal_tag, side_tag))
      for frontal_tag, frontal_paths in frontal_views
      for frontal_path in frontal_paths
      for side_tag, side_paths in side_views
      for side_path in side_paths
  ]
  # Empty exactly when the study lacks a frontal or a lateral image.
  if not pairs:
    return None

  # Flag one pair per study. The choice uses the global `random` state, so
  # seed it beforehand if the selection has to be reproducible.
  select = [0] * len(pairs)
  select[random.randint(0, len(pairs) - 1)] = 1

  return pd.DataFrame({
      # Looked up by name: a positional lookup silently returns a different
      # column as soon as the CSV layout changes.
      'study_id': df['study_id'].iloc[0],
      'frontal': [pair[0] for pair in pairs],
      'side': [pair[1] for pair in pairs],
      'view': [pair[2] for pair in pairs],
      'select': select,
  }, columns=['study_id', 'frontal', 'side', 'view', 'select'])

In [18]:
# Build the pair table from a fresh groupby instead of rebinding the GroupBy
# object held in `train_g`: the cell can then be re-run, whereas applying to
# `train_g` a second time would hit the DataFrame produced by the first run.
train_g = train.groupby('study_id').apply(aggregate_images)

In [19]:
# Drop the index produced by the groupby/apply step and renumber the pair rows
# from 0.
train_g = train_g.reset_index(drop=True)

In [20]:
# Second grouping of the raw rows by study, used below to get one metadata/label
# row per study.
train_l = train.groupby('study_id')

In [21]:
# One row per study. Built from a fresh groupby so the cell can be re-run
# (rebinding `train_l` in place leaves a DataFrame behind, on which a second
# `.first()` call does not work).
# NOTE(review): GroupBy.first() takes the first non-null value of each column
# independently, so a resulting row can combine values from different images of
# the study — confirm that is acceptable for the label columns.
train_l = train.groupby('study_id').first()

In [22]:
# Attach the per-study metadata/labels to every (frontal, lateral) pair. The
# right join keeps exactly the rows of the pair table.
train_merge = train_l.merge(train_g, on='study_id', how='right')

In [None]:
# Distribution of the raw view positions. Only AP, PA, LL and LATERAL are used
# by aggregate_images; rows with any other value are never paired.
train['ViewPosition'].value_counts()

AP          75080
PA          66968
LATERAL     58130
LL          24302
LAO             3
RAO             3
PA LLD          2
AP AXIAL        2
AP LLD          2
AP RLD          1
SWIMMERS        1
Name: ViewPosition, dtype: int64

In [None]:
# Number of (frontal, lateral) pairs per view combination.
train_merge['view'].value_counts()

pa_la    48147
pa_ll    22473
ap_la    17035
ap_ll     2130
Name: view, dtype: int64

In [24]:
# Persist the pair table (with header, without the index) next to the input CSV.
train_merge.to_csv("/content/drive/My Drive/AG-CNN/dataset/test_set_merge.csv", index=False, header=True)

In [23]:
# Preview the first rows of the merged pair table.
train_merge.head()

Unnamed: 0.1,study_id,Unnamed: 0,dicom_id,subject_id,ViewPosition,Atelectasis,Cardiomegaly,Consolidation,Edema,Enlarged Cardiomediastinum,Fracture,Lung Lesion,Lung Opacity,No Finding,Pleural Effusion,Pleural Other,Pneumonia,Pneumothorax,Support Devices,path,frontal,side,view,select
0,50000028,376819,1ff9650f-4cb5af18-f6caef33-e53686b7-983cca76,19995127,LATERAL,,,,,,,1.0,,,,,,,,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,pa_la,1
1,50000230,57297,7e962a95-d661c0db-4769286c-e150a106-fb9586c6,11550925,PA,1.0,,,,,,,,,,,,,,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,pa_la,1
2,50000766,37737,5fbd6d46-6ebe9e7e-ece0d766-43ebe17f-c2605f24,11027112,PA,1.0,1.0,,1.0,,,,,,0.0,,,,,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,pa_ll,1
3,50000936,75474,1b4edfd9-223f91d2-eb1ec8da-f98e2968-15ead25f,12008045,LATERAL,,,,,,,,,1.0,,,,,,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,pa_la,1
4,50001481,221570,02d065af-d0785395-162a5490-e0e1143e-d24656af,15868528,LATERAL,,,,,,,,,1.0,,,,,,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,/content/drive/My Drive/mimic-cxr/images512/p1...,pa_la,1


In [None]:
# Select the 14 finding columns by name rather than by position, so the slice
# keeps pointing at the labels if a column is added or dropped upstream.
# `.loc` label slices include both end points.
# NOTE(review): the labels are used as stored; missing (NaN) values are not
# filled here — confirm that is intended for the TFRecords written below.
train_labels = train_merge.loc[:, 'Atelectasis':'Support Devices']

### Image Pre-processing

In [None]:
# train_la = train[(train['ViewPosition'] == 'LL')]

In [None]:
# train_labels = train_la.iloc[:,5:19]

In [None]:
# train_labels.fillna(0, inplace=True)

# train_labels.replace(-1, 0, inplace=True)

In [None]:
# Disabled: single-view dataset. It reads `train_la`, whose only definition is
# commented out above, so the cell raises NameError on a fresh kernel; the
# multi-view `train_ds` built in the next cell replaces it anyway.
# train_ds = tf.data.Dataset.from_tensor_slices((train_la['path'],np.array(train_labels), train_la['study_id']))

In [None]:
# One dataset element per (frontal, lateral) pair, in the component order
# expected by process_path_mul: study id, frontal path, lateral path, view tag,
# select flag, label vector.
train_ds = tf.data.Dataset.from_tensor_slices(
    tuple(train_merge[col] for col in ('study_id', 'frontal', 'side', 'view', 'select'))
    + (np.array(train_labels),)
)

In [None]:
def decode_img(img):
  """Decode PNG bytes into a 224x224, 3-channel float32 image tensor.

  Args:
    img: scalar string tensor holding the encoded PNG file contents.

  Returns:
    The image converted with `tf.image.convert_image_dtype(..., tf.float32)`
    and resized to 224x224.
  """
  pixels = tf.image.decode_png(img, channels=3)
  pixels = tf.image.convert_image_dtype(pixels, tf.float32)
  resized = tf.image.resize(pixels, (224,224))
  return resized

def process_path(path, label, sid):
  """Load and decode the image at `path`; pass `label` and `sid` through.

  Returns:
    (decoded image, label, sid)
  """
  decoded = decode_img(tf.io.read_file(path))
  return decoded, label, sid

def process_path_mul(sid, front, side, view, select, label):
  """Load and decode the frontal and lateral image of one pair.

  Args:
    sid: study id of the pair.
    front: path of the frontal image.
    side: path of the lateral image.
    view: view tag of the pair.
    select: select flag of the pair.
    label: label vector of the study.

  Returns:
    (frontal image, lateral image, label, view, select, sid) — note that the
    order differs from the argument order.
  """
  img_front = decode_img(tf.io.read_file(front))
  img_side = decode_img(tf.io.read_file(side))
  return img_front, img_side, label, view, select, sid

In [None]:
# Disabled: single-view variant. `process_path` takes (path, label, sid), but
# the single-view `train_ds` depends on `train_la` (whose definition is
# commented out) and the multi-view `train_ds` yields six components, so this
# cell cannot run in sequence. It also targets the same cache file as the
# multi-view pipeline in the next cell.
# train_im = train_ds.map(process_path, num_parallel_calls=tf.data.experimental.AUTOTUNE)\
#     .apply(tf.data.experimental.ignore_errors()).cache('/content/train_cache')

In [None]:
# Decode both images of every pair in parallel.
# ignore_errors() drops elements whose processing raises (e.g. an unreadable
# image file) instead of aborting, so the dataset can end up shorter than
# train_merge.
# The decoded elements are cached in a file; remove /content/train_cache* before
# re-running with different input, otherwise the old cache may be reused.
train_im = (
    train_ds
    .map(process_path_mul, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .apply(tf.data.experimental.ignore_errors())
    .cache('/content/train_cache')
)

### Create TFRecord

In [None]:
# Number of (frontal, lateral) pairs.
# NOTE(review): the recorded output (19122) does not match the sum of the view
# counts shown earlier (89785) — the outputs come from different runs.
len(train_merge)

19122

In [None]:
# NOTE(review): BS is not referenced by any of the visible cells.
BS = 64
# Number of TFRecord shards the pair table is split into.
TR_SHARDS = 10
# Output directory of the shards. Despite the GCS_ prefix this is a Google Drive
# path; the shard file name is appended by plain string concatenation, so keep
# the trailing slash.
GCS_OUTPUT = '/content/drive/My Drive/AG-CNN/tfrecord_train_mul_nr/' 

In [None]:
import math

# Records per shard, rounded up so that TR_SHARDS shards cover every pair (the
# last shard may be smaller).
tr_size = train_merge.shape[0]
tr_shard_size = math.ceil(tr_size / TR_SHARDS)

In [None]:
# NOTE(review): the recorded output (2245) is not ceil(19122 / 10) = 1913, i.e.
# it was produced with a different table size and/or TR_SHARDS value than the
# ones shown above — re-run to refresh.
tr_shard_size

2245

In [None]:
def recompress_image(image, label, sid):
  """Re-encode one float image as JPEG bytes; pass `label` and `sid` through.

  The image is multiplied by 255 and cast to uint8 before encoding.

  Returns:
    (JPEG bytestring, label, sid)
  """
  as_uint8 = tf.cast(image * 255, tf.uint8)
  jpeg_bytes = tf.image.encode_jpeg(as_uint8, optimize_size=True, chroma_downsampling=False)
  return jpeg_bytes, label, sid

In [None]:
def recompress_image_mul(image_front,image_side, label, view, select, sid):
  """Re-encode the frontal and lateral image of one pair as JPEG bytes.

  Each image is multiplied by 255 and cast to uint8 before encoding. The
  remaining components are returned unchanged.

  Returns:
    (frontal JPEG, lateral JPEG, label, view, select, sid)
  """
  def _as_jpeg(image):
    as_uint8 = tf.cast(image * 255, tf.uint8)
    return tf.image.encode_jpeg(as_uint8, optimize_size=True, chroma_downsampling=False)

  jpeg_front = _as_jpeg(image_front)
  jpeg_side = _as_jpeg(image_side)
  return jpeg_front, jpeg_side, label, view, select, sid

In [None]:
# Encode the images as JPEG, then batch by shard size: every batch pulled from
# this dataset becomes one TFRecord file.
train_imc = (
    train_im
    .map(recompress_image_mul, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(tr_shard_size)
    .prefetch(1)
)

In [None]:
# val_imc = val_im.map(recompress_image, num_parallel_calls=tf.data.experimental.AUTOTUNE)
# val_imc = val_imc.batch(val_shard_size).prefetch(1)

In [None]:
# Three types of data can be stored in TFRecords: bytestrings, integers and floats
# They are always stored as lists, a single data element will be a list of size 1

def _bytestring_feature(list_of_bytestrings):
  """Wrap a list of bytestrings in a tf.train.Feature (bytes_list)."""
  bytes_list = tf.train.BytesList(value=list_of_bytestrings)
  return tf.train.Feature(bytes_list=bytes_list)

def _int_feature(list_of_ints): # int64
  """Wrap a list of integers in a tf.train.Feature (int64_list)."""
  int64_list = tf.train.Int64List(value=list_of_ints)
  return tf.train.Feature(int64_list=int64_list)

def _float_feature(list_of_floats): # float32
  """Wrap a list of floats in a tf.train.Feature (float_list)."""
  float_list = tf.train.FloatList(value=list_of_floats)
  return tf.train.Feature(float_list=float_list)
  

def to_tfrecord(tfrec_filewriter, img_front,img_side, label, view, select, sid):
  """Build the tf.train.Example for one (frontal, lateral) pair.

  Args:
    tfrec_filewriter: not used by this function; the caller writes the
      returned Example itself.
    img_front: encoded frontal image (bytestring).
    img_side: encoded lateral image (bytestring).
    label: array of label values; flattened with `reshape(-1)` before storing.
    view: view tag of the pair (bytestring).
    select: select flag of the pair (integer).
    sid: study id (integer).

  Returns:
    A tf.train.Example with the features `image_front`, `image_side`, `label`,
    `view`, `select` and `study_id`.
  """
  features = tf.train.Features(feature={
      "image_front": _bytestring_feature([img_front]),
      "image_side": _bytestring_feature([img_side]),
      "label": _float_feature(label.reshape(-1)),
      "view": _bytestring_feature([view]),
      "select": _int_feature([select]),
      "study_id": _int_feature([sid]),
  })
  return tf.train.Example(features=features)


In [None]:
print("Writing TFRecords")
# TFRecordWriter does not create missing directories.
os.makedirs(GCS_OUTPUT, exist_ok=True)
# NOTE(review): shard numbering starts at 19, presumably to continue after
# shards written by an earlier run — confirm, and reset to 0 for a fresh export.
for shard, batch in tqdm.tqdm(enumerate(train_imc, start=19)):
  # batch size used as shard size here
  shard_size = batch[0].shape[0]
  # good practice to have the number of records in the filename
  filename = GCS_OUTPUT + "{:02d}-{}.tfrec".format(shard, shard_size)
  # Shards that already exist are left untouched (their batch has still been
  # computed by the input pipeline at this point).
  if os.path.exists(filename):
    continue

  # Convert every batched tensor to NumPy once per shard instead of once per
  # record; calling .numpy() inside the record loop re-materialises the whole
  # batch for every single example.
  image_front, image_la, label, view, select, sid = [component.numpy() for component in batch]

  with tf.io.TFRecordWriter(filename) as out_file:
    for i in tqdm.tqdm(range(shard_size)):
      example = to_tfrecord(out_file,
                            image_front[i],
                            image_la[i],
                            label[i],
                            view[i],
                            select[i],
                            sid[i])
      out_file.write(example.SerializeToString())
  # Report only after the writer has been closed, i.e. the file is complete.
  print("Wrote file {} containing {} records".format(filename, shard_size))

0it [00:00, ?it/s]

Writing TFRecords



  0%|          | 0/2245 [00:00<?, ?it/s][A
  0%|          | 1/2245 [00:01<53:28,  1.43s/it][A
  0%|          | 3/2245 [00:01<38:16,  1.02s/it][A
  0%|          | 5/2245 [00:01<27:34,  1.35it/s][A
  0%|          | 7/2245 [00:01<20:07,  1.85it/s][A
  0%|          | 8/2245 [00:06<1:09:03,  1.85s/it][A
  0%|          | 10/2245 [00:06<49:05,  1.32s/it] [A
  1%|          | 12/2245 [00:07<35:06,  1.06it/s][A
  1%|          | 14/2245 [00:07<25:22,  1.47it/s][A
  1%|          | 16/2245 [00:07<18:33,  2.00it/s][A
  1%|          | 18/2245 [00:07<13:44,  2.70it/s][A
  1%|          | 20/2245 [00:07<10:24,  3.56it/s][A
  1%|          | 22/2245 [00:07<08:14,  4.50it/s][A
  1%|          | 24/2245 [00:10<18:53,  1.96it/s][A
  1%|          | 25/2245 [00:10<14:19,  2.58it/s][A
  1%|          | 27/2245 [00:10<10:58,  3.37it/s][A
  1%|▏         | 29/2245 [00:10<08:35,  4.30it/s][A
  1%|▏         | 31/2245 [00:10<06:56,  5.32it/s][A
  1%|▏         | 33/2245 [00:10<05:46,  6.38it/s][A
  2

Wrote file /content/drive/My Drive/AG-CNN/tfrecord_train_mul_nr/19-2245.tfrec containing 2245 records



  0%|          | 0/2245 [00:00<?, ?it/s][A
  0%|          | 1/2245 [00:02<1:27:19,  2.33s/it][A
  0%|          | 2/2245 [00:04<1:29:29,  2.39s/it][A
  0%|          | 4/2245 [00:05<1:03:24,  1.70s/it][A
  0%|          | 6/2245 [00:05<45:05,  1.21s/it]  [A
  0%|          | 8/2245 [00:05<32:18,  1.15it/s][A
  0%|          | 10/2245 [00:05<23:24,  1.59it/s][A
  1%|          | 12/2245 [00:06<23:24,  1.59it/s][A
  1%|          | 14/2245 [00:07<23:19,  1.59it/s][A
  1%|          | 16/2245 [00:08<17:09,  2.17it/s][A
  1%|          | 18/2245 [00:08<12:45,  2.91it/s][A
  1%|          | 20/2245 [00:08<09:45,  3.80it/s][A
  1%|          | 22/2245 [00:09<13:47,  2.69it/s][A
  1%|          | 24/2245 [00:10<16:35,  2.23it/s][A
  1%|          | 25/2245 [00:13<40:11,  1.09s/it][A
  1%|          | 27/2245 [00:13<28:51,  1.28it/s][A
  1%|▏         | 29/2245 [00:13<20:54,  1.77it/s][A
  1%|▏         | 31/2245 [00:13<15:22,  2.40it/s][A
  1%|▏         | 33/2245 [00:16<24:17,  1.52it/s][

Wrote file /content/drive/My Drive/AG-CNN/tfrecord_train_mul_nr/20-2245.tfrec containing 2245 records



  0%|          | 0/2245 [00:00<?, ?it/s][A
  0%|          | 1/2245 [00:03<2:20:57,  3.77s/it][A
  0%|          | 3/2245 [00:03<1:39:26,  2.66s/it][A
  0%|          | 5/2245 [00:04<1:10:20,  1.88s/it][A
  0%|          | 7/2245 [00:04<49:59,  1.34s/it]  [A
  0%|          | 8/2245 [00:05<48:37,  1.30s/it][A
  0%|          | 10/2245 [00:05<34:46,  1.07it/s][A
  1%|          | 12/2245 [00:05<25:03,  1.49it/s][A
  1%|          | 14/2245 [00:05<18:19,  2.03it/s][A
  1%|          | 16/2245 [00:05<13:34,  2.74it/s][A
  1%|          | 18/2245 [00:07<16:34,  2.24it/s][A
  1%|          | 20/2245 [00:07<12:21,  3.00it/s][A
  1%|          | 22/2245 [00:07<09:26,  3.93it/s][A
  1%|          | 24/2245 [00:07<07:26,  4.97it/s][A
  1%|          | 26/2245 [00:07<05:58,  6.19it/s][A
  1%|          | 28/2245 [00:07<04:55,  7.50it/s][A
  1%|▏         | 30/2245 [00:08<04:11,  8.81it/s][A
  1%|▏         | 32/2245 [00:08<03:47,  9.74it/s][A
  2%|▏         | 34/2245 [00:08<03:24, 10.81it/s][

Wrote file /content/drive/My Drive/AG-CNN/tfrecord_train_mul_nr/21-2245.tfrec containing 2245 records



  0%|          | 0/2245 [00:00<?, ?it/s][A
  0%|          | 1/2245 [00:00<04:29,  8.34it/s][A
  0%|          | 3/2245 [00:00<03:58,  9.39it/s][A
  0%|          | 5/2245 [00:00<03:39, 10.22it/s][A
  0%|          | 7/2245 [00:00<03:24, 10.93it/s][A
  0%|          | 9/2245 [00:00<03:12, 11.63it/s][A
  0%|          | 11/2245 [00:00<03:03, 12.19it/s][A
  1%|          | 13/2245 [00:02<09:56,  3.74it/s][A
  1%|          | 15/2245 [00:02<07:46,  4.78it/s][A
  1%|          | 17/2245 [00:06<27:26,  1.35it/s][A
  1%|          | 19/2245 [00:06<20:05,  1.85it/s][A
  1%|          | 20/2245 [00:07<28:20,  1.31it/s][A
  1%|          | 22/2245 [00:07<20:39,  1.79it/s][A
  1%|          | 23/2245 [00:09<28:30,  1.30it/s][A
  1%|          | 25/2245 [00:09<20:45,  1.78it/s][A
  1%|          | 27/2245 [00:09<15:18,  2.41it/s][A
  1%|▏         | 29/2245 [00:09<11:36,  3.18it/s][A
  1%|▏         | 31/2245 [00:11<15:30,  2.38it/s][A
  1%|▏         | 33/2245 [00:11<11:39,  3.16it/s][A
  2%|▏

Wrote file /content/drive/My Drive/AG-CNN/tfrecord_train_mul_nr/22-2245.tfrec containing 2245 records


In [None]:
def read_tfrecord(example):
    """Parse one serialized tf.train.Example written by `to_tfrecord`.

    The feature names match the ones `to_tfrecord` stores (`image_front`,
    `image_side`, `label`, `view`, `select`, `study_id`). No image size is
    stored in the records, so the image shape is fixed to 224x224x3, the size
    `decode_img` resizes to before the images are re-encoded.

    Args:
        example: scalar string tensor holding the serialized Example.

    Returns:
        (image_front, image_side, label, view, select, study_id), the same
        component order used when the records were written. Images are uint8
        tensors of shape [224, 224, 3]; `label` has shape [1, 14].
    """
    features = {
        "image_front": tf.io.FixedLenFeature([], tf.string),  # tf.string = bytestring (not text string)
        "image_side": tf.io.FixedLenFeature([], tf.string),
        "label": tf.io.VarLenFeature(tf.float32),             # a variable number of floats
        "view": tf.io.FixedLenFeature([], tf.string),
        "select": tf.io.FixedLenFeature([], tf.int64),
        "study_id": tf.io.FixedLenFeature([], tf.int64),
    }
    # decode the TFRecord
    example = tf.io.parse_single_example(example, features)

    # FixedLenFeature fields are now ready to use: example['study_id']
    # VarLenFeature fields require additional sparse_to_dense decoding

    def _decode(jpeg_bytes):
        image = tf.image.decode_jpeg(jpeg_bytes, channels=3)
        # Give the image a static shape so that downstream batching works.
        return tf.reshape(image, [224, 224, 3])

    image_front = _decode(example['image_front'])
    image_side = _decode(example['image_side'])
    label = tf.reshape(tf.sparse.to_dense(example['label']), [-1, 14])
    return image_front, image_side, label, example['view'], example['select'], example['study_id']

In [None]:
# Read the shards back. Several TFRecord files are read in parallel and the
# deterministic-order guarantee is switched off, which lets tf.data reorder
# elements for throughput.
option_no_order = tf.data.Options()
option_no_order.experimental_deterministic = False

filenames = tf.io.gfile.glob(GCS_OUTPUT + "*.tfrec")
train_dsr = (
    tf.data.TFRecordDataset(filenames, num_parallel_reads=tf.data.experimental.AUTOTUNE)
    .with_options(option_no_order)
    .map(read_tfrecord, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .shuffle(300)
)