## Breast Cancer Detection Using CNN

This notebook explores how deep learning methods, particularly convolutional neural networks (CNNs), can be leveraged to improve the accuracy of breast cancer diagnosis using histopathological images. By optimizing these techniques, we aim to enable earlier detection and minimize the devastating impact of this disease.
Accurate and early diagnosis of breast cancer can significantly improve patient outcomes and reduce the physical and mental toll of the disease.
Globally, breast cancer claims the lives of 670,000 people annually and affects 2.3 million women, underscoring the urgent need for enhanced diagnostic tools.


In [None]:
import tensorflow as tf
from tensorflow.keras import models, layers
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.preprocessing import image
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import os
import PIL
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Rescaling
import glob
import cv2

from sklearn.metrics import classification_report
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv2D,MaxPooling2D, Flatten, Dropout, BatchNormalization
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam, SGD
from keras.metrics import binary_crossentropy
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import confusion_matrix
from tensorflow.keras.regularizers import l2
import itertools

## Loading Data

In [None]:
# Create the dataset
# Recursively collect the path of every .png file under the IDC_regular_ps50_idx5
# folder; `dataset` is a plain list of path strings (glob gives no ordering guarantee).
dataset = glob.glob('/kaggle/input/breast-histopathology-images/IDC_regular_ps50_idx5/**/*.png',recursive = True)


In [None]:
# Peek at the first three collected paths to confirm the glob pattern matched.
for sample_path in itertools.islice(dataset, 3):
    print(sample_path)

In [None]:
## The number of images we have
## (`dataset` holds one path per .png patch file, so its length is the image count)

len(dataset)

In [None]:
# The number of patients
# Folder counts show the number of patients: every sub-folder directly under
# the dataset root is counted, and stray non-directory entries are ignored.

# Absolute path, matching the root used for the glob that builds `dataset`,
# so the result does not depend on the current working directory.
basepath = "/kaggle/input/breast-histopathology-images/IDC_regular_ps50_idx5/"
folder_count = [
    entry for entry in os.listdir(basepath)
    if os.path.isdir(os.path.join(basepath, entry))
]
len(folder_count)

## Visualization

In [None]:
# Split the patch paths by class. Every path ends in ".png", so the character
# at position -5 is the one right before the extension: '0' goes to the
# negative list and '1' to the positive list; anything else is dropped.
negative_idc = [path for path in dataset if path[-5] == '0']
positive_idc = [path for path in dataset if path[-5] == '1']

plt.figure(figsize=(15, 15))

# 18 random indices per class (drawn with replacement, so repeats are possible).
some_non = np.random.randint(0, len(negative_idc), 18)
some_can = np.random.randint(0, len(positive_idc), 18)

# The 6x6 grid alternates classes: odd cells (1, 3, 5, ...) show negative
# patches and even cells (2, 4, 6, ...) show positive ones.
for slot, pick in enumerate(some_non):
    patch = image.img_to_array(
        image.load_img(negative_idc[pick], target_size=(100, 100))
    )
    plt.subplot(6, 6, 2 * slot + 1)
    plt.axis('off')
    plt.title('no cancer')
    plt.imshow(patch.astype('uint8'))

for slot, pick in enumerate(some_can, start=1):
    patch = image.img_to_array(
        image.load_img(positive_idc[pick], target_size=(100, 100))
    )
    plt.subplot(6, 6, 2 * slot)
    plt.axis('off')
    plt.title('cancer positive')
    plt.imshow(patch.astype('uint8'))


In [None]:

# Report the class sizes: negative count on the first line, positive on the second.
print(len(negative_idc), len(positive_idc), sep='\n')

In [None]:
## Categories positive idc or negative idc

num_negative_idc, num_positive_idc = len(negative_idc), len(positive_idc)

categories = ['Negative IDC', 'Positive IDC']
counts = [num_negative_idc, num_positive_idc]

# Graph: one bar per class.
fig, ax = plt.subplots(figsize=(7, 5))
bars = ax.bar(categories, counts, color=['RoyalBlue', 'coral'])

# Write each bar's value just above it, centred horizontally on the bar.
for rect in bars:
    top = rect.get_height()
    centre_x = rect.get_x() + rect.get_width() / 2
    ax.text(centre_x, top, f'{top}', ha='center', va='bottom')

ax.set(
    xlabel='IDC Status',
    ylabel='Number of Patches',
    title='Number of Negative and Positive IDC Patches',
)

plt.show()

Negative IDC patches far outnumber positive IDC patches, so the dataset has a class-imbalance problem. This needs to be addressed before modelling.

## Under-Sampling 

In [None]:
## Under-Sampling
## Keeping X and y separate and under-sampling.
## The same number of patches is drawn from each class (at most desired/2 per
## class), so the sampled set is balanced and holds roughly `desired` images.
## Sampling both classes with one shared probability would keep the original
## class ratio and leave the imbalance untouched.

total = len(negative_idc) + len(positive_idc)
if total == 0:
    raise ValueError('No image paths available: negative_idc and positive_idc are both empty.')
ppos = len(positive_idc)/total
desired = 40000
psamp = desired/total
print(total, ppos, desired, psamp)

# Equal per-class quota, capped by the size of the smaller class.
per_class = min(desired // 2, len(negative_idc), len(positive_idc))


def _load_random_patches(paths, n_keep):
    """Return up to n_keep randomly chosen images from paths, resized to 50x50.

    Paths are drawn without replacement. A file that OpenCV cannot decode
    (cv2.imread returns None) is reported and skipped rather than crashing
    inside cv2.resize.
    """
    patches = []
    for idx in np.random.permutation(len(paths))[:n_keep]:
        patch = cv2.imread(paths[idx], cv2.IMREAD_COLOR)
        if patch is None:
            print('Skipping unreadable image: {}'.format(paths[idx]))
            continue
        patches.append(cv2.resize(patch, (50, 50), interpolation=cv2.INTER_LINEAR))
    return patches


non_img_arr = _load_random_patches(negative_idc, per_class)
can_img_arr = _load_random_patches(positive_idc, per_class)
non_y = [0] * len(non_img_arr)
can_y = [1] * len(can_img_arr)

# Negatives first, then positives. Joining the Python lists before converting
# also works when one class contributed no images.
X = np.array(non_img_arr + can_img_arr)
y = np.array(non_y + can_y)

In [None]:
def describeData(a,b):
    """Print a short summary of an image collection and its binary labels.

    Args:
        a: Sized, indexable collection of image arrays. Only its length and
            the ``shape`` of its first element are read.
        b: Labels as any array-like (list, tuple or ndarray); entries equal
            to 0 are counted as IDC(-) and entries equal to 1 as IDC(+).

    Prints four lines: total image count, IDC(-) count, IDC(+) count and the
    shape of the first image ('N/A' when ``a`` is empty).
    """
    # Convert once so that plain lists are compared element-wise; `list == 0`
    # would evaluate to a single False and both counts would silently be 0.
    labels = np.asarray(b)
    print('Total number of images: {}'.format(len(a)))
    print('Number of IDC(-) Images: {}'.format(np.sum(labels == 0)))
    print('Number of IDC(+) Images: {}'.format(np.sum(labels == 1)))
    # Image arrays are indexed rows first, so the shape reads
    # (Height, Width, Channels).
    first_shape = a[0].shape if len(a) > 0 else 'N/A'
    print('Image shape (Height, Width, Channels): {}'.format(first_shape))
# Summarise the sampled image array X and its label vector y.
describeData(X,y)

## Train-Test Splitting 

In [None]:
## Train-Test Splitting

from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

# Hold out 30% of the samples for testing.
# NOTE(review): the split is neither stratified nor seeded (no `stratify` /
# `random_state`), so class ratios and results vary between runs.
X_train, X_test, Y_train, Y_test = train_test_split(X, y, test_size=0.3)

# One-hot encode the 0/1 labels into two columns.
Y_train, Y_test = (
    to_categorical(labels, num_classes=2) for labels in (Y_train, Y_test)
)

for caption, split in (("Training Data Shape:", X_train), ("Testing Data Shape:", X_test)):
    print(caption, split.shape)

## Modelling 

In [None]:
# Early-stopping callback configured on validation loss with a patience of 5 epochs.
early_stop = EarlyStopping(monitor='val_loss', patience=5)


def _conv3x3(filters, **extra):
    """Build a 3x3, same-padded, ReLU convolution with He-uniform initialisation."""
    return Conv2D(filters, (3, 3), activation='relu', kernel_initializer='he_uniform', padding='same', **extra)


def _dense_relu(units):
    """Build a ReLU dense layer with He-uniform initialisation and L2(0.01) weight penalty."""
    return Dense(units, activation='relu', kernel_initializer='he_uniform', kernel_regularizer=l2(0.01))


model = Sequential([
    # Convolutional stage 1: two 32-filter convolutions on 50x50x3 input.
    _conv3x3(32, input_shape=(50, 50, 3)),
    BatchNormalization(),
    _conv3x3(32),
    MaxPooling2D((2, 2)),
    BatchNormalization(),
    Dropout(0.3),
    # Convolutional stage 2: two 64-filter convolutions.
    _conv3x3(64),
    BatchNormalization(),
    _conv3x3(64),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.3),
    # Final 128-filter convolution feeds straight into the classifier head.
    _conv3x3(128),
    Flatten(),
    # Fully connected head.
    _dense_relu(128),
    BatchNormalization(),
    _dense_relu(64),
    BatchNormalization(),
    _dense_relu(64),
    Dropout(0.3),
    _dense_relu(24),
    # Two-way softmax output, one unit per class.
    Dense(2, activation='softmax'),
])

In [None]:
# Configure training: Adam with a 1e-4 learning rate, binary cross-entropy loss, accuracy metric.
model.compile(
    optimizer=Adam(learning_rate=0.0001),
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Model Training
# X_test/Y_test are passed as validation data, so every val_* value in
# `history` is measured on the test split.
# `early_stop` (EarlyStopping on val_loss) must be handed to fit() through
# `callbacks`; without it the callback is created but never takes effect.

history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_test, Y_test),
    epochs=10,
    batch_size=35,
    callbacks=[early_stop],
)

## Model Evaluation 

In [None]:
# One figure per tracked quantity: the training curve first, then the curve
# for the validation data (labelled 'test' in the legend).
for metric, heading in (('accuracy', 'Model Accuracy'), ('loss', 'Model Loss')):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(heading)
    plt.xlabel('epoch')
    plt.ylabel(metric)
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
# Predicted Values
Y_pred = model.predict(X_test)
# Reduce the per-class scores and the one-hot targets to class indices.
Y_pred_classes = np.argmax(Y_pred, axis=1)
Y_true = np.argmax(Y_test, axis=1)

# Confusion matrix
confusion_mtx = confusion_matrix(Y_true, Y_pred_classes)

# Confusion matrix visualization
# fmt='d' annotates each cell as a whole number: the cells are integer counts,
# and the previous '.1f' format rendered them with a spurious ".0".
f, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(confusion_mtx, annot=True, linewidths=0.01, cmap="coolwarm", linecolor="gray", fmt='d', ax=ax)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Evaluate the trained model on the test split; as the last expression of the
# cell, the returned value is shown as the cell output.
model.evaluate(X_test,Y_test)

## Checking

In [None]:
def img_plot(arr,index=0,bgr=True):
    """Display ``arr[index]`` with matplotlib under the title 'Test Image'.

    Args:
        arr: Indexable collection of image arrays.
        index: Position of the image to show.
        bgr: When True (the default), a 3-channel image is treated as being in
            OpenCV's BGR channel order and is flipped to RGB before display,
            because ``plt.imshow`` reads the last axis as RGB. Pass False for
            images that are already RGB.
    """
    img = arr[index]
    # Only flip colour images; reversing the last axis of a 2-D (grayscale)
    # array would mirror the picture instead of swapping channels.
    if bgr and getattr(img, 'ndim', 0) == 3 and img.shape[-1] == 3:
        img = img[..., ::-1]
    plt.title('Test Image')
    plt.imshow(img)

# X_test was loaded with cv2.imread, so its images are in BGR order and rely
# on the default bgr=True conversion.
index = 1
img_plot(X_test, index)

In [None]:
# Compare the model's prediction with the true label for a single test image.
# (img_plot is already defined in the previous cell and is not used here, so
# it is not redefined.)
index = 1
# Slicing index:index+1, rather than indexing with [index], keeps the leading
# batch axis. The variable is named `sample` so the builtin input() is not shadowed.
sample = X_test[index:index+1]
pred = model.predict(sample)[0].argmax()
label = Y_test[index].argmax()
print('Predicted Value using  cnn model',pred)
print("True Value",label)

Applying deep learning methods is a challenging task, from understanding the dataset to preparing and modelling it. In this notebook, the stages of this task were carried out using the Breast Histopathology Images dataset. The dataset was first visualized, and the patches were categorized as positive or negative. Subsequently, the under-sampling method was used to address the class-imbalance problem. Finally, a complex CNN-based model was built, trained, and tested. Although the results obtained are successful, different results may be obtained in the future with different deep learning models or hybrid models, different parameter values, or different numbers of epochs on the same dataset.