## **1. Import libraries**
### In this step we will import all the necessary packages

In [None]:
import os
from collections import Counter
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Activation, Flatten, Dense, Conv2D, MaxPooling2D
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import splitfolders
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import matplotlib.ticker as mticker
from sklearn.utils.class_weight import compute_class_weight
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import roc_curve, auc

## **2. GPU selection**

### Check and use available GPUs

In [None]:
# Show the GPU devices TensorFlow can see on this machine.
visible_gpus = tf.config.experimental.list_physical_devices('GPU')
print('GPU name: ', visible_gpus)

In [None]:
# Enable on-demand GPU memory allocation instead of reserving all memory up front.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Memory growth has to be configured on every GPU *before* the runtime
        # is initialised, so finish this loop before touching logical devices.
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Listing logical devices initialises the runtime; doing it inside the
        # loop would make set_memory_growth fail for every GPU after the first.
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Raised when the GPUs were already initialised before this cell ran.
        print(e)

## **3. Dataset**

### Import and check the dataset

In [None]:
# Folder holding one sub-folder per class, and the folder that receives the
# train/validation split.
data_dir, train_test_dir = "Malignant_Benign", "TrainTest"

In [None]:
# Count the directory entries found in each class folder under data_dir.
categories = ["Malignant", "Benign"]
image_counts = {
    name: len(os.listdir(os.path.join(data_dir, name)))
    for name in categories
}

print("Image counts:", image_counts)

In [None]:
# Bar chart of the number of images per class, saved as 'image_count.png'.
plt.figure(figsize=(4, 4))

# Materialise the dict views as lists: dict views are not sequences, and
# list input is accepted by every matplotlib version.
category_names = list(image_counts.keys())
category_counts = list(image_counts.values())
plt.bar(category_names, category_counts, color=['red', 'blue'], width=0.5, alpha=0.5)

# Thicken the frame around the plot.
ax = plt.gca()
for spine in ax.spines.values():
    spine.set_linewidth(2)

plt.ylabel('Number of Images', fontweight='bold', fontsize=14)

# Same tick geometry on both axes.
plt.tick_params(axis='x', which='both', length=5, width=2)
plt.tick_params(axis='y', which='both', length=5, width=2)

plt.xticks(fontsize=12, fontweight='bold')
plt.yticks(fontsize=12, fontweight='bold')

# Save before show(): show() may release the figure in non-interactive runs.
plt.savefig('image_count.png', dpi=600, bbox_inches='tight', pad_inches=0.1)
plt.show()

## **4. Data imbalance**

### Check whether the dataset is imbalanced. Use class weighting for imbalanced data

In [None]:
# Share of the whole dataset that each class represents.
total_images = sum(image_counts.values())
class_ratios = {}
for class_name, n_images in image_counts.items():
    class_ratios[class_name] = n_images / total_images

print("Class Ratios:", class_ratios)

# The dataset is treated as imbalanced when one class exceeds 60% of the images.
largest_share = max(class_ratios.values())
is_imbalanced = largest_share > 0.6

In [None]:
# Pie chart of the class distribution, saved as 'class_distribution_pie.png'.
plt.figure(figsize=(4, 4))

# plt.pie converts its input with np.asarray(..., float32), which cannot
# consume a dict view, so turn the views into plain lists first.
slice_labels = list(class_ratios.keys())
slice_values = list(class_ratios.values())

# Pull out only the first slice (the first key of class_ratios) for emphasis;
# sized from the data so the call also works with a different number of classes.
slice_offsets = [0.1 if idx == 0 else 0 for idx in range(len(slice_values))]

plt.pie(
    slice_values,
    labels=slice_labels,
    autopct='%1.1f%%',  # Percentages with one decimal place
    startangle=90,  # Start drawing from the top of the circle
    colors=['red', 'blue'],
    explode=slice_offsets,
    wedgeprops={'alpha': 0.5}
)

plt.title('Class Distribution', fontsize=14, fontweight='bold')

# Save before show(): show() may release the figure in non-interactive runs.
plt.savefig('class_distribution_pie.png', dpi=600, bbox_inches='tight', pad_inches=0.1)

plt.show()

In [None]:
# Build per-class loss weights when the dataset is imbalanced; otherwise leave
# class_weight as None so model.fit applies no weighting.
class_weight = None
if is_imbalanced:
    # flow_from_directory assigns integer labels to class folders in
    # alphanumeric order, so the weights must be keyed in that same order
    # (not in the order the categories happen to be listed in this notebook).
    ordered_classes = sorted(image_counts)
    labels = np.repeat(
        np.arange(len(ordered_classes)),
        [image_counts[name] for name in ordered_classes],
    )
    class_weights = compute_class_weight('balanced', classes=np.unique(labels), y=labels)
    # Map label id -> weight, the format expected by model.fit(class_weight=...).
    class_weight = dict(zip(np.unique(labels).tolist(), class_weights))
    print("Using Class Weights:", class_weight)

## **5. Deep learning model construction**

### Data splitting into train and test sets (80:20)

In [None]:
# Copy the images into <train_test_dir>/train and <train_test_dir>/val (80:20).
# Use the path variables defined earlier so the split reads from the same
# folder that was inspected above, and the loaders find the output.
# move=False copies files, leaving the source folder untouched; the fixed seed
# keeps the split reproducible.
splitfolders.ratio(data_dir, output=train_test_dir,
    seed=1337, ratio=(.8, .2), group_prefix=None, move=False)

### Data augmentation

In [None]:
# Both generators rescale pixel values by 1/255; only the training generator
# adds random augmentation (horizontal flips and rotations up to 20 degrees).
augmentation = dict(horizontal_flip=True, rotation_range=20)
train_datagen = ImageDataGenerator(rescale=1./255, **augmentation)
test_datagen = ImageDataGenerator(rescale=1./255)

In [None]:
# Directory-backed batch loaders for the training and held-out images.
batch_size = 32
img_shape = (116, 116, 3)

# Settings shared by both loaders; target_size is the (height, width) part
# of img_shape.
loader_options = dict(
    target_size=img_shape[:2],
    class_mode='binary',
    batch_size=batch_size,
)

# Training batches are reshuffled every epoch.
train_generator = train_datagen.flow_from_directory(
    os.path.join(train_test_dir, 'train'),
    shuffle=True,
    **loader_options
)

# Keep file order fixed so predictions line up with test_generator.classes.
test_generator = test_datagen.flow_from_directory(
    os.path.join(train_test_dir, 'val'),
    shuffle=False,
    **loader_options
)

### Building of Convolutional Neural Network (CNN) model

In [None]:
# Two convolution + max-pooling stages followed by a dense classifier that
# ends in a single sigmoid unit (binary output).
conv_stages = [
    Conv2D(filters=16, kernel_size=(5, 5), activation='relu', padding='same', input_shape=img_shape),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(filters=32, kernel_size=(5, 5), activation='relu', padding='same'),
    MaxPooling2D(pool_size=(2, 2)),
]
classifier_head = [
    Flatten(),
    Dense(units=128, activation='relu'),
    Dense(units=1, activation='sigmoid'),
]
model = Sequential(conv_stages + classifier_head)

### Compilation of the model

In [None]:
# Compile with Adam and binary cross-entropy, tracking accuracy.
# (Class weighting, if any, is passed to model.fit, not here.)
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

### Training of the model

In [None]:
# Fit for 50 epochs, evaluating on the held-out generator after each epoch.
# class_weight is None unless the imbalance check above produced weights.
hist = model.fit(
    train_generator,
    epochs=50,
    validation_data=test_generator,
    class_weight=class_weight,
)

In [None]:
# Convert the recorded accuracy values from fractions to percentages.
train_acc = []
for fraction in hist.history['accuracy']:
    train_acc.append(100 * fraction)

test_acc = []
for fraction in hist.history['val_accuracy']:
    test_acc.append(100 * fraction)

### Model accuracy

In [None]:
# Draw the training history side by side: loss on the left axes (ax[0]) and
# accuracy in percent on the right axes (ax[1]), then save the figure to disk.
fig, ax = plt.subplots(1, 2, figsize=(18, 6))

# NOTE(review): tick labels are made bold here, before any data is plotted;
# confirm the bold weight survives once the ticks are recomputed for the data.
for axis in ax:  # Apply to both subplots
    axis.tick_params(axis='both', which='major', labelsize=14, width=2, length=5)
    for label in axis.get_xticklabels() + axis.get_yticklabels():
        label.set_fontweight('bold')  # Set bold font

# Plot Training & Validation Loss
ax[0].plot(hist.history['loss'], 'g', lw=3, label='Train Loss')
ax[0].plot(hist.history['val_loss'], 'r', lw=3, label='Validation Loss')
ax[0].set_xlabel('Epochs', fontsize=18, fontweight='bold')
ax[0].set_ylabel('Loss', fontsize=18, fontweight='bold')
ax[0].legend(fontsize=15)
ax[0].set_title('Training & Validation Loss', fontsize=15, fontweight='bold')

# Styling Axes: thicken the frame of the loss plot
for spine in ax[0].spines.values():
    spine.set_linewidth(2)

# Plot Training & Validation Accuracy
# The legend entries embed the last recorded accuracy value of each curve.
ax[1].plot(train_acc, 'b', lw=3, label=f'Train Accuracy: {train_acc[-1]:.2f}%')
ax[1].plot(test_acc, 'orange', lw=3, label=f'Test Accuracy: {test_acc[-1]:.2f}%')
ax[1].set_xlabel('Epochs', fontsize=18, fontweight='bold')
ax[1].set_ylabel('Accuracy (%)', fontsize=18, fontweight='bold')
# Fixed y-range of 10-110 for the percentage axis.
ax[1].set_ylim([10, 110])
ax[1].set_title('Training & Validation Accuracy', fontsize=15, fontweight='bold')
ax[1].legend(fontsize=15)

# Styling Axes: thicken the frame of the accuracy plot
for spine in ax[1].spines.values():
    spine.set_linewidth(2)


# Show the loss tick labels with one decimal place.
ax[0].yaxis.set_major_formatter(mticker.FuncFormatter(lambda x, _: f'{x:.1f}'))

# Save before show(). NOTE(review): the file name contains the spelling
# "accuarcy"; it is left as written because other tooling may rely on it.
plt.savefig('Loss and accuarcy.png', dpi=600, bbox_inches='tight')

plt.show()

### Confusion matrix

In [None]:
# Evaluate the trained model on the held-out generator: print a classification
# report and draw a confusion matrix annotated with counts and row percentages.

# Step 1: Get model predictions and threshold the sigmoid output at 0.5
y_pred_probs = model.predict(test_generator)
y_pred = (y_pred_probs > 0.5).astype(int).flatten()

# Step 2: Get true labels (aligned with the predictions because the test
# generator was created with shuffle=False)
y_true = test_generator.classes

# Step 3: Recover the class names in label-id order from the generator itself.
# flow_from_directory numbers class folders alphanumerically, so hard-coding
# the names in a different order would swap the labels in every output below.
class_indices = test_generator.class_indices
class_names = sorted(class_indices, key=class_indices.get)
label_ids = [class_indices[name] for name in class_names]

# Step 4: Compute confusion matrix (rows = true label, columns = predicted)
cm = confusion_matrix(y_true, y_pred, labels=label_ids)

# Step 5: Print classification report
print("Classification Report:\n",
      classification_report(y_true, y_pred, labels=label_ids, target_names=class_names))

# Step 6: Plot confusion matrix
plt.figure(figsize=(6, 5))

# Each cell shows the raw count and its share of the row, i.e. the share of
# that true class; rows with no samples show 0.0% instead of dividing by zero.
annot = np.array([[f"{value}\n({value/np.sum(row)*100:.1f}%)" if np.sum(row) > 0 else f"{value}\n(0.0%)"
                   for value in row] for row in cm])

# fmt="" because the annotations are already formatted strings.
sns.heatmap(cm, annot=annot, fmt="", cmap="viridis", xticklabels=class_names,
            yticklabels=class_names, annot_kws={"size": 12, "weight": "bold"})

plt.xlabel("Predicted Label", fontsize=12, fontweight="bold")
plt.ylabel("True Label", fontsize=12, fontweight="bold")
plt.title("Confusion Matrix", fontsize=14, fontweight="bold")

plt.savefig('test_cm.png', dpi=600, bbox_inches='tight')

plt.show()

### ROC-AUC analysis on test set

In [None]:
# ROC-AUC on the held-out generator: true labels versus predicted scores.
y_true = test_generator.classes
y_pred = model.predict(test_generator)

n_outputs = y_pred.shape[1]
if n_outputs <= 1:
    # Single output column: ordinary binary ROC curve and its area.
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    roc_auc = auc(fpr, tpr)

    # Draw the curve together with the chance diagonal.
    plt.figure()
    plt.plot(fpr, tpr, color='blue', lw=2, label=f'ROC curve (area = {roc_auc:.2f})')
    plt.plot([0, 1], [0, 1], color='gray', linestyle='--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic (ROC) Curve')
    plt.legend(loc="lower right")
    plt.show()
else:
    # Several output columns: one-vs-rest ROC-AUC over binarised labels.
    from sklearn.preprocessing import label_binarize
    from sklearn.metrics import roc_auc_score

    y_true_bin = label_binarize(y_true, classes=np.arange(n_outputs))
    roc_auc = roc_auc_score(y_true_bin, y_pred, multi_class="ovr")

print(f"ROC-AUC Score: {roc_auc:.4f}")

## **6. Save the model**

In [None]:
# Write the trained model to disk in the .keras format.
model_path = "cancer_classifier_CNN.keras"
model.save(model_path)