<a href="https://www.kaggle.com/code/davidjohnmillard/tfwriter-rsna?scriptVersionId=119799623" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

# RSNA TFWriter

The following is an approach to writing TFRecords to a Bucket in GCS.

# Imports/Setup

In [None]:
!pip install python-gdcm -q
!pip install pylibjpeg -q

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pydicom
import os
import cv2
import tensorflow as tf
import tensorflow_addons as tfa
from tensorflow import keras
# Compare the version numerically: as strings, "2.9" >= "2.11" is True, so a
# plain string comparison would wrongly accept releases older than 2.11.
assert tuple(int(part) for part in tf.__version__.split(".")[:2]) >= (2, 11)
from google.cloud import storage
from pydicom.pixel_data_handlers.util import apply_voi_lut

In [None]:
# Display the installed TensorFlow version string as the cell output.
tf.version.VERSION

In [None]:
# Locations of the competition's test and train metadata CSV files.
path_test = '/kaggle/input/rsna-breast-cancer-detection/test.csv'
path_train = '/kaggle/input/rsna-breast-cancer-detection/train.csv'

In [None]:
# Load the metadata tables: dfte holds the test rows, dftr the training rows.
dfte = pd.read_csv(path_test)
dftr = pd.read_csv(path_train)

# Handle the Data

First we need to split the training data into a training and validation set.

To get the image paths we can apply a function to the dataframe on each row to get the picture with patient_id and image_id.

Next we do some basic preprocessing to make sure each image is normalized.

In [None]:
def train_test_split(dataset):
    """Split ``dataset`` positionally into its first 80% and the remainder.

    Rows are not shuffled; the existing order decides which part a row
    lands in.

    Returns:
        A ``(train, valid)`` tuple of slices of ``dataset``.
    """
    n_rows = dataset.shape[0]
    cut = int(n_rows * .8)
    train_part = dataset[:cut]
    valid_part = dataset[cut:]
    return train_part, valid_part

In [None]:
def add_img_path_to_pd(row):
    """Build the training DICOM path for one metadata row.

    Args:
        row: Mapping with ``'patient_id'`` and ``'image_id'`` entries.

    Returns:
        ``<train_images dir>/<patient_id>/<image_id>.dcm`` as a string.
    """
    base_dir = '/kaggle/input/rsna-breast-cancer-detection/train_images/'
    patient = str(row['patient_id'])
    image = str(row['image_id'])
    return f"{base_dir}{patient}/{image}.dcm"

# Image ROI

In [None]:
def crop_coords(img):
    """
    Locate the bounding box of the largest bright region in ``img``.

    The image is smoothed with a 5x5 Gaussian blur, binarised with Otsu's
    threshold, and the external contour with the largest area is selected.

    Args:
        img: Image array; the caller in this file passes a ``uint8`` array.

    Returns:
        ``(x, y, w, h)`` of the selected contour's bounding rectangle. When
        no contour is found, the rectangle covering the whole image is
        returned so that cropping with it keeps everything.
    """
    # Otsu's thresholding after Gaussian filtering
    blur = cv2.GaussianBlur(img, (5, 5), 0)
    _, breast_mask = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)

    cnts, _ = cv2.findContours(breast_mask.astype(np.uint8), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    if len(cnts) == 0:
        # max() on an empty sequence raises ValueError; use the full frame instead.
        height, width = img.shape[:2]
        return (0, 0, width, height)

    cnt = max(cnts, key=cv2.contourArea)
    x, y, w, h = cv2.boundingRect(cnt)
    return (x, y, w, h)

Normalize the data and clip it to the range between the 5th and 99th percentiles of the non-zero pixels.

In [None]:
def truncation_normalization(img):
    """Clip ``img`` to its 5th-99th percentile range and rescale to [0, 1].

    The percentiles are computed over the non-zero pixels only, and pixels
    that are zero in the input stay zero in the output.

    Args:
        img: Numeric array.

    Returns:
        Array of the same shape as ``img`` with values in [0, 1]. An
        all-zero array is returned when ``img`` has no non-zero pixels or
        when the two percentiles coincide (there is no range to rescale).
    """
    img = np.asarray(img)
    foreground = img[img != 0]
    if foreground.size == 0:
        # No foreground pixels: the percentiles are undefined.
        return np.zeros(img.shape, dtype=np.float64)

    Pmin, Pmax = np.percentile(foreground, [5, 99])
    if Pmax == Pmin:
        # Avoid 0/0 = NaN; every clipped pixel equals Pmin, so the result is 0.
        return np.zeros(img.shape, dtype=np.float64)

    truncated = np.clip(img, Pmin, Pmax)
    normalized = (truncated - Pmin) / (Pmax - Pmin)
    normalized[img == 0] = 0
    return normalized

In [None]:
def clahe(img, clip):
    """Apply OpenCV CLAHE with clip limit ``clip`` to ``img`` and return the result."""
    equalizer = cv2.createCLAHE(clipLimit=clip)
    return equalizer.apply(img)

def parse_clahe(image):
    """Merge ``image`` with two CLAHE-enhanced copies of itself as channels.

    The channels are, in order: the input image, CLAHE with clip limit 1.0,
    and CLAHE with clip limit 2.0.
    """
    enhanced = [clahe(image, limit) for limit in (1.0, 2.0)]
    return cv2.merge((image, enhanced[0], enhanced[1]))

> smaller: (384, 672)

In [None]:
def preprocess(imagepath, voi=False):
    """Load a DICOM file and turn it into a cropped, normalised uint8 image.

    Steps: read the pixels, optionally apply the VOI LUT, invert
    MONOCHROME1 images, crop to the bounding box found by ``crop_coords``,
    rescale with ``truncation_normalization``, resize, and convert to uint8.

    Args:
        imagepath: Path of the ``.dcm`` file to read.
        voi: When true, ``apply_voi_lut`` is applied to the pixel array first.

    Returns:
        ``uint8`` array with a trailing channel axis of length 1. The resize
        target is width 384 and height 672 (``cv2.resize`` takes
        ``(width, height)``).

    Raises:
        ValueError: If the largest pixel value is 0, because the image
            cannot be rescaled.
    """
    ds = pydicom.dcmread(imagepath)
    img = ds.pixel_array

    if voi:
        img = apply_voi_lut(img, ds)

    img_max = np.max(img)
    if img_max == 0:
        # Dividing by a zero maximum would fill the image with NaN.
        raise ValueError(f"{imagepath}: maximum pixel value is 0, cannot normalize")

    unit = img / img_max
    if ds.PhotometricInterpretation == "MONOCHROME1":
        unit = 1 - unit

    # Build the 8-bit image for ROI detection from the unit-range data.
    # Casting the full-range values straight to uint8 overflows for anything
    # above 255 and corrupts the Otsu mask.
    roi_source = (np.clip(unit, 0, 1) * 255).astype(np.uint8)[..., np.newaxis]
    img = (unit * img_max)[..., np.newaxis]

    (x, y, w, h) = crop_coords(roi_source)
    img_cropped = img[y:y+h, x:x+w]

    img_normalized = truncation_normalization(img_cropped)

    # cv2.resize expects (width, height).
    target_width, target_height = int(768/2), int(1344/2)
    img_final = cv2.resize(img_normalized, (target_width, target_height))

    img_final = np.array(img_final*255, dtype=np.uint8)
    img_final = img_final[..., np.newaxis]

    return img_final

In [None]:
# Display the training rows whose 'cancer' column equals 1.
dftr[dftr['cancer'] == 1]

In [None]:
# Keras augmentation pipeline holding a single RandomZoom layer; it is applied
# at the end of augment_image.
# NOTE(review): height_factor is given as (0, -0.3); confirm against the Keras
# RandomZoom documentation that this (lower, upper) ordering is intended.
data_augm_lay = keras.Sequential(
    [
        keras.layers.RandomZoom(height_factor=(0, -0.3))
    ]
)

In [None]:
def augment_image(image, y):
    """Randomly perturb ``image`` and return it together with the label ``y``.

    Applied in order: random brightness (max delta 0.10), random horizontal
    flip, random vertical flip, random JPEG quality between 75 and 100, and
    finally the ``data_augm_lay`` Keras pipeline. ``y`` is passed through
    unchanged.
    """
    # Disabled steps kept for reference:
    #   tf.image.random_contrast(image, 0.90, 1.40)
    #   tf.image.random_saturation(image, 0.50, 2.00)
    #   dropout(image)
    augmented = tf.image.random_brightness(image, 0.10)
    augmented = tf.image.random_flip_left_right(augmented)
    augmented = tf.image.random_flip_up_down(augmented)
    augmented = tf.image.random_jpeg_quality(augmented, 75, 100)
    augmented = data_augm_lay(augmented)
    return augmented, y

In [None]:
# Preview the preprocessing output for one training image without the VOI LUT.
image1 = preprocess('/kaggle/input/rsna-breast-cancer-detection/train_images/10130/388811999.dcm', voi=False)
plt.imshow(image1)

In [None]:
# Same image with the VOI LUT applied, for visual comparison with image1.
image2 = preprocess('/kaggle/input/rsna-breast-cancer-detection/train_images/10130/388811999.dcm', voi=True)
plt.imshow(image2)

In [None]:
# Display the dtype of the preprocessed image.
image1.dtype

In [None]:
# Round-trip image2 through JPEG encoding and decoding, mirroring how images
# are stored in the TFRecords (get_example encodes, parse_example decodes).
image3 = tf.io.encode_jpeg(image2)
image3 = tf.io.decode_jpeg(image3)

In [None]:
# Display the shape of the decoded JPEG tensor.
image3.shape

In [None]:
# Plot the image after the JPEG round trip.
plt.imshow(image3)

In [None]:
# plt.imshow(augment_image(image2, 1)[0])

In [None]:
# Display the shape of the preprocessed (VOI) image.
image2.shape

# Useful Stats Writing Data

This is not an EDA notebook, but we do need some useful statistics to deal with the data.

In [None]:
def useful_stats():
    """Print the percentage of cancer-positive rows in the training and validation frames.

    Reads the module-level ``dftr`` (train) and ``dfv`` (validation)
    DataFrames; a row counts as positive when its ``'cancer'`` value is 1.
    """
    train_positive = dftr[dftr['cancer'] == 1].shape[0]
    train_pct = train_positive / dftr.shape[0] * 100
    print('pos train %: ' + str(train_pct))

    valid_positive = dfv[dfv['cancer'] == 1].shape[0]
    valid_pct = valid_positive / dfv.shape[0] * 100
    print('pos valid %: ' + str(valid_pct))

# Dealing With Files locally and in the Cloud

We need to follow the sequence: 
> write to local file -> push to cloud -> delete from local -> repeat

In [None]:
def push_to_cloud(filepath, bucket_name):
    """Upload ``/kaggle/working/<filepath>`` to the GCS bucket ``bucket_name``.

    The blob is stored under the name ``filepath``. The upload goes through
    the module-level ``storage_client``.

    Args:
        filepath: File name relative to ``/kaggle/working/``; also used as
            the blob name.
        bucket_name: Name of the destination bucket.
    """
    # Log the bucket that is actually used rather than a fixed name.
    print('pushing ' + filepath + ' to cloud in bucket: ' + bucket_name)

    bucket = storage_client.get_bucket(bucket_name)
    blob = bucket.blob(filepath)
    blob.upload_from_filename('/kaggle/working/' + filepath)

In [None]:
def delete_file(filepath):
    """Log and remove ``/kaggle/working/<filepath>``.

    Args:
        filepath: File name relative to ``/kaggle/working/``.
    """
    message = 'deleting ' + filepath + ' from local'
    print(message)
    target = '/kaggle/working/' + filepath
    os.remove(target)
    
def clear_all_local():
    """Delete every regular file directly inside ``/kaggle/working/``.

    The ``.virtual_documents`` entry is always kept. Sub-directories are
    skipped because ``delete_file`` uses ``os.remove``, which raises on
    directories.
    """
    working_dir = '/kaggle/working/'
    for k in os.listdir(working_dir):
        if k == '.virtual_documents':
            continue
        if os.path.isdir(os.path.join(working_dir, k)):
            continue
        delete_file(k)

# Create an Example

Create an example using the features necessary to the Network.

In [None]:
from tensorflow.train import BytesList, FloatList, Int64List
from tensorflow.train import Feature, Features, Example

def get_example(image, label, age, implant, laterality, view, diff_neg):
    """Pack one sample into a ``tf.train.Example``.

    The image is JPEG-encoded (``optimize_size=True``) and stored as bytes;
    ``label``, ``age``, ``implant`` and ``diff_neg`` are stored as int64
    features; ``laterality`` and ``view`` are stored as bytes features.

    Returns:
        The populated ``Example`` message.
    """
    def int_feature(value):
        return Feature(int64_list=Int64List(value=[value]))

    def bytes_feature(value):
        return Feature(bytes_list=BytesList(value=[value]))

    encoded_image = tf.io.encode_jpeg(image, optimize_size=True).numpy()
    fields = {
        'image': bytes_feature(encoded_image),
        'label': int_feature(label),
        'age': int_feature(age),
        # The key is spelled 'impant'; parse_example reads the same key, so
        # the two must stay in sync.
        'impant': int_feature(implant),
        'laterality': bytes_feature(laterality),
        'view': bytes_feature(view),
        'diff_neg': int_feature(diff_neg),
    }
    return Example(features=Features(feature=fields))

# Write TFRecords

To write the files we need the dataset to read from, the name of the batches we are writing, and the bucket to write to.

The dataset is split into n shards, and `offset` lets a run resume from a later shard in case the connection is lost.

For every file in the n files created we create an example using the corresponding instance's values in the dataframe.

Finally, the example is serialized and written to the file.

This file is then pushed to gcs and deleted from the local system.

In [None]:
def write_tfrecords(name, dataset, bucket_name, n_shards, offset):
    """Write ``dataset`` as TFRecord shards, uploading and deleting each one.

    Shard ``k`` (for ``k`` in ``range(offset, n_shards)``) is written to
    ``<name>_batch_<k, zero-padded to 3>.tfrecord`` and holds the rows
    ``k * size`` up to ``(k + 1) * size`` of ``dataset``, where ``size`` is
    ``int(rows / n_shards) + 1``. After a shard is written it is pushed to
    ``bucket_name`` and removed locally.

    Args:
        name: Prefix of the shard file names.
        dataset: DataFrame with ``imagepath``, ``cancer``, ``age``,
            ``implant``, ``laterality``, ``view`` and
            ``difficult_negative_case`` columns.
        bucket_name: Destination bucket passed to ``push_to_cloud``.
        n_shards: Total number of shards the dataset is divided into.
        offset: Index of the first shard to write; earlier shards are skipped.

    Returns:
        List of the shard file names that were written.
    """
    shard_indices = range(offset, n_shards)
    paths = [f"{name}_batch_{index:0>3}.tfrecord" for index in shard_indices]

    for shard_index, shard_path in zip(shard_indices, paths):
        with tf.io.TFRecordWriter(shard_path) as writer:
            print('writing to ' + shard_path)
            rows_per_shard = int(dataset.shape[0] / n_shards) + 1
            start = shard_index * rows_per_shard
            stop = (shard_index + 1) * rows_per_shard
            for _, row in dataset[start:stop].iterrows():
                image = preprocess(row['imagepath'], voi=True)
                label = row['cancer']
                age = int(row['age'])
                implant = row['implant']
                laterality = bytes(row['laterality'], 'utf-8')
                view = bytes(row['view'], 'utf-8')
                diff_neg = 1 if row['difficult_negative_case'] else 0
                record = get_example(image, label, age, implant, laterality, view, diff_neg)
                writer.write(record.SerializeToString())

        # The writer is closed here, so the shard is complete before upload.
        push_to_cloud(shard_path, bucket_name)
        delete_file(shard_path)

    return paths

# Define GCS Storage Area

In [None]:
# Google Cloud project id and the storage client that push_to_cloud uses.
client_area = 'kagglersna01'
storage_client = storage.Client(project=client_area)

# Apply Transformation to DataFrame

In [None]:
def setup_data(dirr, dataset):
    """Prepare the metadata frame and split it into train and validation parts.

    Missing ages are replaced by the median age, an ``imagepath`` column is
    added via ``add_img_path_to_pd``, and the frame is split with
    ``train_test_split``. ``dataset`` is modified in place.

    Args:
        dirr: Unused; kept so existing calls keep working.
        dataset: DataFrame with ``age``, ``patient_id`` and ``image_id`` columns.

    Returns:
        The ``(train, valid)`` pair produced by ``train_test_split``.
    """
    # Fill only the 'age' column: a frame-wide fillna would also write the
    # median age into the missing values of every other column.
    dataset['age'] = dataset['age'].fillna(dataset['age'].median())
    dataset['imagepath'] = dataset.apply(add_img_path_to_pd, axis=1)
    return train_test_split(dataset)
# Rebind dftr to the training part of the split and create dfv (validation part).
dftr, dfv = setup_data('train_images', dftr)

In [None]:
# Print the positive-case percentages for the train and validation splits.
useful_stats()

# Run TFRecord Writer

In [None]:
def main_run(name, dataset, bucket_name, n_shards=20, offset=0):
    """Clear the local working files, then write and upload the TFRecord shards.

    Args:
        name: Prefix of the shard file names.
        dataset: DataFrame handed to ``write_tfrecords``.
        bucket_name: Destination bucket.
        n_shards: Total number of shards (default 20).
        offset: Index of the first shard to write (default 0).
    """
    clear_all_local()
    write_tfrecords(
        name=name,
        dataset=dataset,
        bucket_name=bucket_name,
        n_shards=n_shards,
        offset=offset,
    )

In [None]:
# Bucket that receives the shards written by main_run below.
BUCKET_NAME = 'train_batches_smaller'
# main_run('train', dftr, BUCKET_NAME, n_shards=int(200/2), offset=0)
# Write the validation split as 25 shards, starting at shard index 8
# (write_tfrecords skips the shard indices below the offset).
main_run('valid', dfv, BUCKET_NAME, n_shards=int(50/2), offset=8)

# Parse an Example Instance

In [None]:
def parse_example(tfrecord):
    """Decode one serialized ``Example`` into an ``(image, label)`` pair.

    Args:
        tfrecord: Scalar string tensor holding a serialized ``Example`` with
            the features written by ``get_example``.

    Returns:
        ``image``: the decoded single-channel JPEG reshaped to
        ``(672, 384, 1)``; ``label``: the ``label`` feature cast to float32.
    """
    feature_desc = {
        'image': tf.io.FixedLenFeature([], tf.string, default_value=""),
        'label': tf.io.FixedLenFeature([], tf.int64, default_value=-1),
        'age': tf.io.FixedLenFeature([], tf.int64, default_value=-1),
        # Spelled 'impant' to match the key written by get_example.
        'impant': tf.io.FixedLenFeature([], tf.int64, default_value=-1),
        'laterality': tf.io.FixedLenFeature([], tf.string, default_value=""),
        'view': tf.io.FixedLenFeature([], tf.string, default_value=""),
        'diff_neg': tf.io.FixedLenFeature([], tf.int64, default_value=-1)
    }

    example = tf.io.parse_single_example(tfrecord, feature_desc)
    image = tf.io.decode_jpeg(example["image"], channels=1)
    # preprocess resizes with cv2.resize(..., (384, 672)), i.e. width 384 and
    # height 672, so the stored image is (height, width, channels) =
    # (672, 384, 1). Shape entries must be integers, hence floor division.
    image = tf.reshape(image, shape=[1344 // 2, 768 // 2, 1])
    return image, tf.cast(example["label"], tf.float32)

# Get Dataset From GCS Files

Create a tf.dataset from the file names in gcs.

We use multi-threading to speed up processing time.

We also shuffle the dataset to add more variability.

We call dataset.map() to parse each example in the file.

In [None]:
def record_dataset(filepaths, shuffle_buffer_size=5000, batch_size=32, training=True, ordered=False):
    """Build a batched ``tf.data`` pipeline over TFRecord files.

    Records are read in parallel, parsed with ``parse_example``, batched and
    prefetched. In training mode the dataset is also repeated indefinitely
    and, unless ``ordered`` is true, allowed to yield elements in
    non-deterministic order.

    Args:
        filepaths: TFRecord file names to read.
        shuffle_buffer_size: Currently unused because the shuffle step is
            disabled; kept for interface compatibility.
        batch_size: Number of examples per batch.
        training: Enables the ordering options and ``repeat()``.
        ordered: When false, deterministic ordering is switched off in the
            options that are applied in training mode.

    Returns:
        The configured ``tf.data.Dataset`` of ``(image, label)`` batches.
    """
    options = tf.data.Options()

    if not ordered:
        # `deterministic` replaces the deprecated `experimental_deterministic`.
        options.deterministic = False

    dataset = tf.data.TFRecordDataset(filepaths, num_parallel_reads=tf.data.AUTOTUNE)

    #dataset = dataset.cache()

    dataset = dataset.map(parse_example, num_parallel_calls=tf.data.AUTOTUNE)

    if training:
        #dataset = dataset.filter(undersample_majority)
        #dataset = dataset.map(augment_image, num_parallel_calls=tf.data.AUTOTUNE)
        dataset = dataset.with_options(options)
        #dataset = dataset.shuffle(shuffle_buffer_size)
        dataset = dataset.repeat()

    dataset = dataset.batch(batch_size)

    dataset = dataset.prefetch(tf.data.AUTOTUNE)
    return dataset

In [None]:
# Read back the shards produced by the writer in this notebook: they live in
# BUCKET_NAME and are split into int(200/2) training and int(50/2) validation
# shards (see the main_run calls), holding the half-size images that
# parse_example expects.
N_TRAIN_SHARDS = int(200/2)
N_VALID_SHARDS = int(50/2)

trainpaths = ['gs://' + BUCKET_NAME + '/' + "{}_batch_{:0>3}.tfrecord".format('train', index) for index in range(0, N_TRAIN_SHARDS)]
validpaths = ['gs://' + BUCKET_NAME + '/' + "{}_batch_{:0>3}.tfrecord".format('valid', index) for index in range(0, N_VALID_SHARDS)]

train_set = record_dataset(trainpaths, batch_size=64)
valid_set = record_dataset(validpaths, batch_size=64, training=False)

In [None]:
# Sanity check: take the first batch from train_set and print the shape of
# each of its components. Each element `i` is the batched (image, label) pair
# produced by parse_example, so the slice covers both tensors.
for i in train_set:
    for k in i[:64]:
        print(k.shape)
    # Only the first batch is inspected (train_set repeats indefinitely).
    break