In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf

In [2]:
# Global Variables
dir = 'data/cbis-ddsm/'

In [3]:
# Import labels
df = pd.read_csv(f'{dir}mass-labels.csv')
df.tail(1)

Unnamed: 0,identifier,pathology
1695,P_02092_LEFT_MLO_1,2


In [4]:
# Import npy data
img_npys = [np.load(f'{dir}mass-npy/{x}.npy') for x in df.identifier]

In [5]:
# Functions for preprocessing
# Resize Images from import to shape (224,224,3)
def resize(img_npys, size=(224, 224)):
    resized_imgs = [tf.image.resize(np.stack([img] * 3, axis=-1), size).numpy() for img in img_npys]
    return resized_imgs
# Create Dataset from imports, and boolean for data augmentation
def create_dataset(imgs, labels, augment=False):
    # Convert lists of numpy arrays and labels into tf.data.Dataset
    dataset = tf.data.Dataset.from_tensor_slices((imgs, labels))
    # Data augmentation function
    def augment_image(image, label):
        image = tf.image.random_flip_left_right(image)
        image = tf.image.random_flip_up_down(image)
        image = tf.image.rot90(image, k=tf.random.uniform(shape=[], minval=0, maxval=4, dtype=tf.int32))
        image = tf.image.random_brightness(image, max_delta=0.2)
        image = tf.image.random_contrast(image, lower=0.8, upper=1.2)
        return image, label
    # Apply data augmentation to the training dataset only
    if augment: dataset = dataset.map(augment_image, num_parallel_calls=tf.data.AUTOTUNE)
    return dataset

In [6]:
# Resize images
imgs = resize(img_npys)

In [7]:
# Preprocess Images
imgs = tf.keras.applications.densenet.preprocess_input(np.array(imgs))

In [8]:
imgs.shape

(1696, 224, 224, 3)

In [9]:
# Create the dataset
ds = create_dataset([*imgs], df.pathology)

In [10]:
# Shuffle the dataset
ds_rnd = ds.shuffle(buffer_size=len(imgs), seed=42)

In [11]:
# Calculate the sizes of training, validation, and test sets
dim = len(imgs)
dim1 = round(0.1*dim)
dim8 = dim-2*dim1
print(dim,dim8,dim1)

1696 1356 170


In [12]:
# Split the dataset
ds_train = ds_rnd.take(dim8)
ds_r = ds_rnd.skip(dim8)
ds_val = ds_r.take(dim1)
ds_test = ds_r.skip(dim1)

In [13]:
# Batch the datasets for training and evaluation
batch_size = 16  # Reduced batch size for radiology images
ds_train = ds_train.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
ds_val = ds_val.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)
ds_test = ds_test.batch(batch_size).prefetch(buffer_size=tf.data.AUTOTUNE)

In [19]:
for i, (image, label) in enumerate(ds_train.take(5)):
        print(f"Sample {i+1}: Image shape: {image.shape}, Label: {label}")

Sample 1: Image shape: (16, 224, 224, 3), Label: [2 2 0 2 0 2 1 2 2 2 0 0 1 2 2 0]
Sample 2: Image shape: (16, 224, 224, 3), Label: [2 0 2 1 2 2 2 2 0 2 0 2 2 0 2 0]
Sample 3: Image shape: (16, 224, 224, 3), Label: [0 2 0 0 0 0 0 0 0 2 0 0 1 2 0 0]
Sample 4: Image shape: (16, 224, 224, 3), Label: [2 2 0 2 0 0 0 0 0 0 0 2 1 2 2 0]
Sample 5: Image shape: (16, 224, 224, 3), Label: [2 2 2 2 0 2 0 0 2 0 0 1 2 2 2 0]


In [15]:
# Build base model
base_model = tf.keras.applications.DenseNet121(input_shape=(224, 224, 3),
                                               include_top=False,
                                               weights='imagenet')
base_model.trainable = False

In [21]:
# Add custom layers for classification
global_avg_layer = tf.keras.layers.GlobalAveragePooling2D()
dropout_layer = tf.keras.layers.Dropout(0.3)  # Add dropout to prevent overfitting
output_layer = tf.keras.layers.Dense(3, activation='softmax')  # Multi-class classification for labels [0, 1, 2]
# Assemble the model
model = tf.keras.Sequential([
    base_model,
    global_avg_layer,
    dropout_layer,
    output_layer
])

In [22]:
# Compile the model with a lower learning rate
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [23]:
# Train the model
model.fit(ds_train,
          validation_data=ds_val,
          epochs=20,
          callbacks=[debug_callback])

Epoch 1/20
Training batch: 0
Finished batch: 0
[1m 1/85[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m18:35[0m 13s/step - accuracy: 0.5000 - loss: 1.1597Training batch: 1
Finished batch: 1
[1m 2/85[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22s[0m 273ms/step - accuracy: 0.5312 - loss: 1.1166Training batch: 2
Finished batch: 2
[1m 3/85[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22s[0m 275ms/step - accuracy: 0.5278 - loss: 1.1120Training batch: 3
Finished batch: 3
[1m 4/85[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m22s[0m 274ms/step - accuracy: 0.5208 - loss: 1.1210Training batch: 4
Finished batch: 4
[1m 5/85[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 272ms/step - accuracy: 0.5217 - loss: 1.1199Training batch: 5
Finished batch: 5
[1m 6/85[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 271ms/step - accuracy: 0.5163 - loss: 1.1252Training batch: 6
Finished batch: 6
[1m 7/85[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m21s[0m 270ms/step - accuracy: 0.5076 - loss: 1.1327Training b

<keras.src.callbacks.history.History at 0x225831fa200>

In [25]:
# Evaluate the model
eval_results = model.evaluate(ds_test)
print(f"Test Loss: {eval_results[0]}, Test Accuracy: {eval_results[1]}")

[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 255ms/step - accuracy: 0.5483 - loss: 0.8997
Test Loss: 0.8910624980926514, Test Accuracy: 0.5176470875740051
