In [18]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [9]:
# Global Variables
# Base folder for the input files. Later cells build paths by plain f-string
# concatenation (f'{dir}mass-labels.csv'), so the trailing slash is required.
# NOTE(review): the name `dir` shadows Python's built-in dir() for the whole
# notebook; renaming it (e.g. DATA_DIR) would have to be done in every cell
# that reads it.
dir = 'data/cbis-ddsm/'

In [15]:
# Import labels
# The CSV sits directly under the data folder; each row carries an
# `identifier` and a `pathology` column that later cells read.
labels_csv = f'{dir}mass-labels.csv'
df = pd.read_csv(labels_csv)
# Show the last row as a quick look at the columns and the row count.
df.tail(1)

Unnamed: 0,identifier,pathology
1695,P_02092_LEFT_MLO_1,2


In [23]:
# Import npy data
# One .npy file per label row, loaded in the order of df.identifier so that
# img_npys[i] lines up with row i of df.
img_npys = list(map(
    np.load,
    (f'{dir}mass-npy/{identifier}.npy' for identifier in df.identifier),
))
# Peek at the last loaded array.
img_npys[-1]

array([[ 62.5524203 ,  63.86429184,  52.932029  , ...,  70.86094006,
         63.86429184,  78.7321693 ],
       [ 60.36596773,  64.73887287,  62.98971081, ...,  88.3525606 ,
         74.35926417,  70.42364954],
       [ 42.41717983,  47.68454283,  53.80661002, ...,  96.66108036,
         95.34920882,  88.78985112],
       ...,
       [108.90521475, 108.46792423, 108.03063372, ..., 219.57946839,
        220.45404942, 223.0777925 ],
       [ 96.66108036, 100.59669499, 104.53230961, ..., 218.26759685,
        223.0777925 , 226.57611661],
       [100.59669499,  99.28482345,  87.47797958, ..., 231.38631226,
        227.01340712, 227.01340712]])

In [45]:
# Functions for preprocessing
# Resize imported images to shape (*size, channels); by default (224, 224, 1)
def resize(img_npys, size=(224, 224), channels=1):
    """Resize grayscale image arrays to a common spatial size.

    Each input array gets a new trailing channel axis (the image plane is
    stacked `channels` times along it) and is then resized with
    tf.image.resize using that function's default interpolation.

    Args:
        img_npys: iterable of image arrays; expected to be 2-D (height, width),
            since a channel axis is appended before resizing.
        size: target (height, width) handed to tf.image.resize.
        channels: how many copies of the image plane to stack on the last
            axis. The default of 1 keeps the previous output shape
            (224, 224, 1); pass 3 to obtain (224, 224, 3).

    Returns:
        list of numpy arrays, one per input image, each of shape
        (size[0], size[1], channels).

    Raises:
        ValueError: if `channels` is smaller than 1.
    """
    if channels < 1:
        raise ValueError(f'channels must be >= 1, got {channels}')
    resized_imgs = []
    for img in img_npys:
        # tf.image.resize needs an explicit channel axis, so add it first.
        with_channels = np.stack([img] * channels, axis=-1)
        resized_imgs.append(tf.image.resize(with_channels, size).numpy())
    return resized_imgs
# Build a tf.data.Dataset from images and labels, optionally augmented
def create_dataset(imgs, labels, augment=False):
    """Wrap images and labels in a tf.data.Dataset of (image, label) pairs.

    Args:
        imgs: sequence of image arrays, sliced along the first axis.
        labels: sequence of labels, aligned with `imgs`.
        augment: when True, every element is mapped through random flips,
            a random multiple-of-90-degree rotation, and random brightness /
            contrast changes. Labels are passed through unchanged.

    Returns:
        The (optionally augmented) tf.data.Dataset.
    """
    def augment_image(image, label):
        # Random mirror along both axes.
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        # Rotate by 0, 90, 180 or 270 degrees (maxval is exclusive).
        quarter_turns = tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32)
        image = tf.image.rot90(image, k=quarter_turns)
        # NOTE(review): max_delta=0.2 is an additive offset; on pixel values
        # in a 0-255 range it would be almost invisible - confirm the scale.
        image = tf.image.random_brightness(image, max_delta=0.2)
        image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
        return image, label

    # Slice the inputs into one (image, label) element per sample.
    dataset = tf.data.Dataset.from_tensor_slices((imgs, labels))
    if not augment:
        return dataset
    # Augmentation is meant for training data only, so it is opt-in.
    return dataset.map(augment_image, num_parallel_calls=tf.data.AUTOTUNE)

In [46]:
# Resize images
# Runs resize() with its default target size over every loaded array; the
# result is a list holding one resized array per input image.
imgs = resize(img_npys)

In [47]:
# Sanity check: shape of the first resized image
imgs[0].shape

(224, 224, 1)

In [48]:
# Create the dataset
# Pairs each resized image with the `pathology` label of the same row.
# `augment` keeps its default (False), so no augmentation is applied here.
ds = create_dataset(imgs, df.pathology)

In [49]:
# Shuffle the dataset
# The buffer covers the whole dataset, so this is a full uniform shuffle.
# reshuffle_each_iteration must be False: by default shuffle() draws a NEW
# permutation every time the dataset is iterated, and the train/val/test
# splits built later with take()/skip() are each iterated separately. With
# reshuffling on, the same sample could land in several splits (data leakage).
ds_rnd = ds.shuffle(buffer_size=len(imgs), seed=42, reshuffle_each_iteration=False)

In [50]:
# Calculate the sizes of training, validation, and test sets
dim = len(imgs)
# 10% of the samples for validation and another 10% for test.
# (Previously read an undefined name `dim_ttl`, which raises NameError on a
# fresh kernel; the total sample count is `dim`.)
dim1 = round(0.1 * dim)
# Training gets everything that is left after the two 10% splits.
dim8 = dim - 2 * dim1
print(dim, dim8, dim1)

1696 1356 170


In [51]:
# Split the dataset
# First dim8 elements -> training set.
ds_train = ds_rnd.take(dim8)
# Everything after the training part is shared between validation and test.
ds_r = ds_rnd.skip(dim8)
# Of that remainder: first dim1 elements -> validation, the rest -> test.
ds_val, ds_test = ds_r.take(dim1), ds_r.skip(dim1)

In [56]:
# Functions to export datasets using the TFRecord format
def serialize_example(image, label):
    """Serialize one (image, label) pair into a tf.train.Example byte string.

    The image is flattened to 1-D and stored under the float feature 'image';
    the label is stored under the single-element int64 feature 'label'. The
    image shape is not written to the record, so a reader has to reshape the
    flat vector itself.

    Args:
        image: image as an eager tensor or array-like of numbers.
        label: scalar integer label (eager tensor, numpy integer or int).

    Returns:
        The serialized tf.train.Example as bytes.
    """
    # Convert to plain numpy / Python values before building the proto.
    # Passing an EagerTensor straight into FloatList makes protobuf iterate
    # the tensor element by element (very slow for a whole image) and depends
    # on how the installed protobuf version coerces tensor scalars.
    pixels = np.asarray(image, dtype=np.float32).reshape(-1)
    feature = {
        'image': tf.train.Feature(float_list=tf.train.FloatList(value=pixels)),
        'label': tf.train.Feature(int64_list=tf.train.Int64List(value=[int(label)]))
    }
    example_proto = tf.train.Example(features=tf.train.Features(feature=feature))
    return example_proto.SerializeToString()
def write_tfrecord(dataset, filename):
    """Write every (image, label) element of `dataset` to a TFRecord file.

    Args:
        dataset: iterable yielding (image, label) pairs.
        filename: path of the TFRecord file to create.
    """
    # The context manager closes the writer once all records are written.
    with tf.io.TFRecordWriter(filename) as record_writer:
        for img, lbl in dataset:
            record_writer.write(serialize_example(img, lbl))

In [None]:
# Export Datasets
# One TFRecord file per split, written in train -> val -> test order.
for _split_ds, _out_path in (
    (ds_train, 'data/ds_train.tfrecord'),
    (ds_val, 'data/ds_val.tfrecord'),
    (ds_test, 'data/ds_test.tfrecord'),
):
    write_tfrecord(_split_ds, _out_path)