## Importing the relevant packages

In [2]:
import os
import numpy as np
import cv2
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense,Dropout,BatchNormalization,GlobalAveragePooling2D
from tensorflow.keras.regularizers import l2
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.image import ImageDataGenerator, load_img
from tensorflow.keras.callbacks import ReduceLROnPlateau
import scipy.ndimage
from skimage.transform import resize
from PIL import Image
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Dropout, GlobalAveragePooling2D
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator

## Uploading and Preparing the data for training

In [None]:
# Colab-only step: mount Google Drive at /content/drive so files stored there
# can be read through ordinary filesystem paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Root folders of the two BreaKHis classes on the mounted drive.
# (A single-argument os.path.join returns its argument unchanged, so the plain
# literals are equivalent.)
benign_dir = '/content/drive/My Drive/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/benign'
malignant_dir = '/content/drive/My Drive/BreaKHis_v1/BreaKHis_v1/histology_slides/breast/malignant'

In [None]:
# Integer class ids used as training targets: benign -> 0, malignant -> 1.
benign_label, malignant_label = 0, 1

In [None]:
def collect_files_with_labels(directory, label):
    """Recursively collect every PNG file under ``directory`` and tag it with ``label``.

    Args:
        directory: Root folder to search; all sub-folders are walked.
        label: Value attached to every file that is found.

    Returns:
        A list of ``(file_path, label)`` tuples sorted by path. A missing or
        empty directory yields an empty list.
    """
    # The extension check is case-insensitive so '.PNG' files are not silently dropped.
    png_paths = [
        os.path.join(root, file_name)
        for root, _dirs, files in os.walk(directory)
        for file_name in files
        if file_name.lower().endswith('.png')
    ]
    # os.walk yields entries in filesystem order, which differs between machines.
    # Sorting keeps the dataset order (and therefore any seeded split) reproducible.
    return [(path, label) for path in sorted(png_paths)]

In [None]:
# Gather (path, label) pairs for each class.
benign_files = collect_files_with_labels(benign_dir, benign_label)
malignant_files = collect_files_with_labels(malignant_dir, malignant_label)

# Per-class image counts, reported to show the class balance.
num_benign_files, num_malignant_files = len(benign_files), len(malignant_files)

print(f"Number of benign files: {num_benign_files}")
print(f"Number of malignant files: {num_malignant_files}")

In [None]:
def convert_images_to_structured_arrays(benign_file_paths, malignant_file_paths, target_size=(224, 224)):
    """Load, resize and label images into one NumPy structured array.

    Args:
        benign_file_paths: Iterable of ``(file_path, label)`` tuples for benign images.
        malignant_file_paths: Iterable of ``(file_path, label)`` tuples for malignant images.
        target_size: ``(width, height)`` every image is resized to. Defaults to
            ``(224, 224)``.

    Returns:
        A 1-D structured array with fields ``'image'`` (uint8, shape
        ``(height, width, 3)``) and ``'label'`` (int32). Benign records come
        first, then malignant ones. Files that cannot be loaded are reported on
        stdout and skipped, so the result may be shorter than the inputs.
    """
    width, height = target_size
    record_dtype = np.dtype([('image', np.uint8, (height, width, 3)), ('label', np.int32)])

    loaded = []
    for kind, file_paths in (("benign", benign_file_paths), ("malignant", malignant_file_paths)):
        for file_path, label in file_paths:
            try:
                # The context manager closes the file handle as soon as the pixels are copied.
                with Image.open(file_path) as img:
                    pixels = np.array(img.convert('RGB').resize((width, height)))
            except Exception as e:
                # Best effort: one unreadable file must not abort the whole load.
                print(f"Error loading {kind} image {file_path}: {e}")
                continue
            # Use the label that travels with the path rather than a module-level global.
            loaded.append((pixels, label))

    combined_structured_array = np.empty(len(loaded), dtype=record_dtype)
    for i, record in enumerate(loaded):
        combined_structured_array[i] = record

    return combined_structured_array

In [None]:
# Load every collected benign and malignant file into one structured array
# (fields 'image' and 'label'); files that fail to load are skipped.
combined_structured_array = convert_images_to_structured_arrays(benign_files, malignant_files)


In [None]:
# Quick sanity check of the loaded dataset: size plus the first record.
print(f"Total images in combined array: {len(combined_structured_array)}")
first_record = combined_structured_array[0]
print(f"First image shape: {first_record['image'].shape}")
print(f"First image label: {first_record['label']}")

In [None]:
# Pull the two fields of the structured array apart into the model inputs and
# the matching targets; row i of `images` corresponds to entry i of `labels`.
images = combined_structured_array['image']  # Extract all images
labels = combined_structured_array['label']  # Extract all labels

## Splitting the data into Train, Validation and Test sets

In [None]:
# Hold out 10% of the data as a test set that is never touched during training.
# stratify=labels keeps the benign/malignant ratio the same in both parts, so
# the test metrics are not skewed by an unlucky draw on an imbalanced dataset.
# NOTE(review): the split is per image. If the dataset holds several images of
# the same patient, a patient-level (grouped) split would be needed to avoid
# leakage between the sets — confirm.
X_temp, X_test, y_temp, y_test = train_test_split(
    images, labels, test_size=0.10, random_state=42, stratify=labels
)
print(f'Temp set shape: {X_temp.shape}, Test set shape: {X_test.shape}')

In [None]:
# Split the remaining 90% into training and validation sets. 2/9 of 90% is 20%
# of the full dataset, giving a 70/20/10 train/validation/test split overall.
# stratify=y_temp preserves the class ratio in both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=2/9, random_state=42, stratify=y_temp
)
print(f'Training set shape: {X_train.shape}, Validation set shape: {X_val.shape}')

## Data Augmentation

In [None]:
# Random geometric augmentations applied on the fly to training batches:
# rotations up to 20 degrees, zoom up to 20%, horizontal and vertical flips,
# and shifts of up to 10% of the width/height.
datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.2,
    horizontal_flip=True,
    vertical_flip=True,
    width_shift_range=0.1,
    height_shift_range=0.1
)

# datagen.fit(X_train) is intentionally not called. It only computes dataset
# statistics for featurewise_center, featurewise_std_normalization and
# zca_whitening, none of which are enabled here. Calling it would just make a
# full floating-point copy of the training set in memory.

## Baseline Model

In [None]:
# Define number of classes
num_classes = 2  # 0 for benign, 1 for malignant

# Baseline CNN: two conv/pool stages followed by a small dense classifier.
model_1 = Sequential([
    tf.keras.Input(shape=(224, 224, 3)),
    # The pipeline feeds raw 0-255 pixels. Scaling to [0, 1] inside the model
    # keeps training, validation and test inputs consistent.
    tf.keras.layers.Rescaling(1.0 / 255),

    # First convolutional stage
    Conv2D(16, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Second convolutional stage (more filters)
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),

    # Classifier head
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),

    # One sigmoid unit: output is the predicted probability of class 1.
    Dense(1, activation='sigmoid'),
])

# Binary cross-entropy matches the single sigmoid output.
model_1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary to see the architecture
model_1.summary()

In [None]:
# Stop once validation accuracy has not improved for 4 epochs (restoring the
# best weights) and halve the learning rate after 2 epochs without improvement.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_accuracy',
    patience=4,
    restore_best_weights=True,
)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_accuracy',
    factor=0.5,
    patience=2,
)

# Augmented training batches of 8 images; validation uses the untouched arrays.
train_batches_1 = datagen.flow(X_train, y_train, batch_size=8)
history_1 = model_1.fit(
    train_batches_1,
    steps_per_epoch=len(X_train) // 8,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler],
)

In [None]:
# Per-epoch training curves recorded by fit() for the baseline model.
metrics_1 = history_1.history
acc_1, val_acc_1 = metrics_1['accuracy'], metrics_1['val_accuracy']
print("accuracy = ",acc_1)
print("val_accuracy = ",val_acc_1)
loss_1, val_loss_1 = metrics_1['loss'], metrics_1['val_loss']
print("loss = ",loss_1)
print("val_loss = ",val_loss_1)


In [None]:
epochs_range_1 = range(len(acc_1))

# Side-by-side panels: accuracy on the left, loss on the right.
fig_1, (ax_acc_1, ax_loss_1) = plt.subplots(1, 2, figsize=(8, 8))

ax_acc_1.plot(epochs_range_1, acc_1, label='Training Accuracy')
ax_acc_1.plot(epochs_range_1, val_acc_1, label='Validation Accuracy')
ax_acc_1.legend(loc='lower right')
ax_acc_1.set_title('Training and Validation Accuracy')

ax_loss_1.plot(epochs_range_1, loss_1, label='Training Loss')
ax_loss_1.plot(epochs_range_1, val_loss_1, label='Validation Loss')
ax_loss_1.legend(loc='upper right')
ax_loss_1.set_title('Training and Validation Loss')
plt.show()

## InceptionV3 Based Model

In [None]:
from tensorflow.keras.applications import InceptionV3

# Load InceptionV3 with pre-trained ImageNet weights, excluding the top layer
base_model_Inception = InceptionV3(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model layers to retain pre-trained weights
base_model_Inception.trainable = False

# The data pipeline delivers raw 0-255 pixels, but the ImageNet weights of
# InceptionV3 expect inputs scaled by its own preprocess_input. Doing the
# scaling inside the model guarantees training, validation and test data are
# all treated identically.
inputs_2 = tf.keras.Input(shape=(224, 224, 3))
x_2 = tf.keras.applications.inception_v3.preprocess_input(inputs_2)
# training=False keeps the frozen backbone in inference mode.
x_2 = base_model_Inception(x_2, training=False)

# Custom classification head
x_2 = GlobalAveragePooling2D()(x_2)  # Global Average Pooling
x_2 = Dense(128, activation='relu')(x_2)  # Fully connected layer
x_2 = Dropout(0.5)(x_2)  # Dropout for regularization
output = Dense(1, activation='sigmoid')(x_2)  # Binary classification output

# Create the final model
model_2 = Model(inputs=inputs_2, outputs=output)

# Compile the model
model_2.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model_2.summary()


In [None]:
# Training with early stopping and learning rate reduction.
# The scheduler's patience must be shorter than the early-stopping patience.
# With both at 3, training stopped in the same epoch the learning rate would
# first have been reduced, so the reduction never affected a training step.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=1)

history_2 = model_2.fit(
    datagen.flow(X_train, y_train, batch_size=8),  # Augmented batches
    steps_per_epoch=len(X_train) // 8,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler]
)


In [None]:
# Per-epoch training curves recorded by fit() for the InceptionV3 model.
metrics_2 = history_2.history
acc_2, val_acc_2 = metrics_2['accuracy'], metrics_2['val_accuracy']
print("accuracy = ",acc_2)
print("val_accuracy = ",val_acc_2)
loss_2, val_loss_2 = metrics_2['loss'], metrics_2['val_loss']
print("loss = ",loss_2)
print("val_loss = ",val_loss_2)


In [None]:
epochs_range_2 = range(len(acc_2))

# Side-by-side panels: accuracy on the left, loss on the right.
fig_2, (ax_acc_2, ax_loss_2) = plt.subplots(1, 2, figsize=(6, 6))

ax_acc_2.plot(epochs_range_2, acc_2, label='Training Accuracy')
ax_acc_2.plot(epochs_range_2, val_acc_2, label='Validation Accuracy')
ax_acc_2.legend(loc='lower right')
ax_acc_2.set_title('Training and Validation Accuracy')

ax_loss_2.plot(epochs_range_2, loss_2, label='Training Loss')
ax_loss_2.plot(epochs_range_2, val_loss_2, label='Validation Loss')
ax_loss_2.legend(loc='upper right')
ax_loss_2.set_title('Training and Validation Loss')
plt.show()

## ResNet50 Based Model

In [None]:
from tensorflow.keras.applications import ResNet50

# Load ResNet50 with pre-trained ImageNet weights, excluding the top layer
base_model_ResNet = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Freeze the base model layers to retain pre-trained weights
base_model_ResNet.trainable = False

# The data pipeline delivers raw 0-255 RGB pixels, but the ImageNet weights of
# ResNet50 expect inputs transformed by its own preprocess_input. Doing this
# inside the model guarantees training, validation and test data are all
# treated identically.
inputs_3 = tf.keras.Input(shape=(224, 224, 3))
x_3 = tf.keras.applications.resnet50.preprocess_input(inputs_3)
# training=False keeps the frozen backbone in inference mode.
x_3 = base_model_ResNet(x_3, training=False)

# Custom classification head
x_3 = GlobalAveragePooling2D()(x_3)  # Global Average Pooling
x_3 = Dense(128, activation='relu')(x_3)  # Fully connected layer with 128 units
x_3 = Dropout(0.5)(x_3)  # Regularization with dropout
output_3 = Dense(1, activation='sigmoid')(x_3)  # Output layer for binary classification

# Create the final model
model_3 = Model(inputs=inputs_3, outputs=output_3)

# Compile the model
model_3.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model_3.summary()


In [None]:
# Training with early stopping and learning rate reduction.
# The scheduler's patience must be shorter than the early-stopping patience.
# With both at 3, training stopped in the same epoch the learning rate would
# first have been reduced, so the reduction never affected a training step.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=1)

history_3 = model_3.fit(
    datagen.flow(X_train, y_train, batch_size=8),  # Augmented batches
    steps_per_epoch=len(X_train) // 8,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler]
)


In [None]:
# Per-epoch training curves recorded by fit() for the ResNet50 model.
metrics_3 = history_3.history
acc_3, val_acc_3 = metrics_3['accuracy'], metrics_3['val_accuracy']
print("accuracy = ",acc_3)
print("val_accuracy = ",val_acc_3)
loss_3, val_loss_3 = metrics_3['loss'], metrics_3['val_loss']
print("loss = ",loss_3)
print("val_loss = ",val_loss_3)


In [None]:
epochs_range_3 = range(len(acc_3))

# Side-by-side panels: accuracy on the left, loss on the right.
fig_3, (ax_acc_3, ax_loss_3) = plt.subplots(1, 2, figsize=(6, 6))

ax_acc_3.plot(epochs_range_3, acc_3, label='Training Accuracy')
ax_acc_3.plot(epochs_range_3, val_acc_3, label='Validation Accuracy')
ax_acc_3.legend(loc='lower right')
ax_acc_3.set_title('Training and Validation Accuracy')

ax_loss_3.plot(epochs_range_3, loss_3, label='Training Loss')
ax_loss_3.plot(epochs_range_3, val_loss_3, label='Validation Loss')
ax_loss_3.legend(loc='upper right')
ax_loss_3.set_title('Training and Validation Loss')
plt.show()

## EfficientNetB0 Based Model

In [None]:
from tensorflow.keras.applications import EfficientNetB0

# Build the EfficientNetB0 backbone with ImageNet weights and without its
# original classification head, then freeze it so only the new head is trained.
# NOTE(review): Keras' EfficientNet is documented as doing its own input
# rescaling, so raw 0-255 pixels should be the correct input here — confirm
# for the installed Keras version.
base_model_EfficientNet = EfficientNetB0(
    include_top=False,
    weights='imagenet',
    input_shape=(224, 224, 3))
base_model_EfficientNet.trainable = False

# Add custom layers: pooled backbone features -> 128-unit dense layer ->
# dropout -> single sigmoid unit.
x_4 = base_model_EfficientNet.output
x_4 = GlobalAveragePooling2D()(x_4)  # Global Average Pooling
x_4 = Dense(128, activation='relu')(x_4)  # Fully connected layer with 128 units
x_4 = Dropout(0.5)(x_4)  # Regularization with dropout
output_4 = Dense(1, activation='sigmoid')(x_4)  # Output layer for binary classification

# Create the final model
model_4 = Model(inputs=base_model_EfficientNet.input, outputs=output_4)

# Compile the model; binary cross-entropy matches the single sigmoid output.
model_4.compile(optimizer=Adam(learning_rate=0.0001), loss='binary_crossentropy', metrics=['accuracy'])

# Print the model summary
model_4.summary()


In [None]:
# Training with early stopping and learning rate reduction.
# The scheduler's patience must be shorter than the early-stopping patience.
# With both at 3, training stopped in the same epoch the learning rate would
# first have been reduced, so the reduction never affected a training step.
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)
lr_scheduler = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_accuracy', factor=0.5, patience=1)

history_4 = model_4.fit(
    datagen.flow(X_train, y_train, batch_size=8),  # Augmented batches
    steps_per_epoch=len(X_train) // 8,
    epochs=10,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, lr_scheduler]
)


In [None]:
# Per-epoch training curves recorded by fit() for the EfficientNetB0 model.
metrics_4 = history_4.history
acc_4, val_acc_4 = metrics_4['accuracy'], metrics_4['val_accuracy']
print("accuracy = ",acc_4)
print("val_accuracy = ",val_acc_4)
loss_4, val_loss_4 = metrics_4['loss'], metrics_4['val_loss']
print("loss = ",loss_4)
print("val_loss = ",val_loss_4)


In [None]:
epochs_range_4 = range(len(acc_4))

# Side-by-side panels: accuracy on the left, loss on the right.
fig_4, (ax_acc_4, ax_loss_4) = plt.subplots(1, 2, figsize=(6, 6))

ax_acc_4.plot(epochs_range_4, acc_4, label='Training Accuracy')
ax_acc_4.plot(epochs_range_4, val_acc_4, label='Validation Accuracy')
ax_acc_4.legend(loc='lower right')
ax_acc_4.set_title('Training and Validation Accuracy')

ax_loss_4.plot(epochs_range_4, loss_4, label='Training Loss')
ax_loss_4.plot(epochs_range_4, val_loss_4, label='Validation Loss')
ax_loss_4.legend(loc='upper right')
ax_loss_4.set_title('Training and Validation Loss')
plt.show()

## Analysis

Compare all 4 models

In [None]:
# Score every trained model on the held-out test set and keep
# (name, loss, accuracy) rows for the comparison table.
results = []
candidate_models = (model_1, model_2, model_3, model_4)

for idx, candidate in enumerate(candidate_models, start=1):
    test_loss, test_accuracy = candidate.evaluate(X_test, y_test, verbose=0)
    print(f"Model {idx}: Test Loss = {test_loss:.4f}, Test Accuracy = {test_accuracy:.4f}")
    results.append((f"Model {idx}", test_loss, test_accuracy))

A table for better comparison

In [None]:
# Fixed-width table: one header row, then one row per evaluated model.
print("\nEvaluation Results:")
header_row = "{:<10} {:<10} {:<10}".format("Model", "Loss", "Accuracy")
print(header_row)
for row in results:
    name, loss, acc = row
    print(f"{name:<10} {loss:<10.4f} {acc:<10.4f}")


Confusion Matrices and other scores

In [None]:
def evaluate_model(model, X_test, y_test, model_name="Model"):
    """Plot the confusion matrix and print F1, AUC and a classification report.

    Args:
        model: Fitted binary classifier whose ``predict`` returns one
            probability per sample.
        X_test: Inputs passed to ``model.predict``.
        y_test: True binary labels (0 or 1), one per sample.
        model_name: Label used in the plot title and the printed header.

    Returns:
        None. Results are shown as a figure and printed to stdout.
    """
    # Flatten to 1-D so predictions, labels and scores all share one shape.
    y_pred_proba = np.asarray(model.predict(X_test)).ravel()

    # Convert probabilities to class labels with a 0.5 decision threshold.
    y_pred = (y_pred_proba > 0.5).astype("int32")
    y_true = np.asarray(y_test).ravel()

    # labels=[0, 1] forces a 2x2 matrix even when one class is absent, so it
    # always matches the two display labels.
    conf_matrix = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=conf_matrix, display_labels=["Class 0", "Class 1"])

    # Draw on an explicit axes. disp.plot() creates its own figure otherwise,
    # which left a stray empty figure behind.
    fig, ax = plt.subplots(figsize=(5, 4))
    disp.plot(cmap="Blues", values_format='d', ax=ax)
    ax.set_title(f"{model_name} - Confusion Matrix")
    plt.show()

    # F1 Score
    f1 = f1_score(y_true, y_pred)
    print(f"\n===== {model_name} =====")
    print("F1 Score:", f1)

    # AUC Score: roc_auc_score raises when y_true contains a single class.
    if np.unique(y_true).size < 2:
        print("AUC Score: undefined (only one class present in y_true)")
    else:
        auc = roc_auc_score(y_true, y_pred_proba)
        print("AUC Score:", auc)

    # Classification Report
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred))

# Run the detailed evaluation (confusion matrix, F1, AUC, report) for each model in turn.
for model_number, fitted_model in enumerate((model_1, model_2, model_3, model_4), start=1):
    evaluate_model(fitted_model, X_test, y_test, f"Model {model_number}")
