# Breast Cancer Classification

## Utils

In [2]:
#utils.py
#import os

# File extensions (lower-case, leading dot included) that count as images.
image_types = (
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".tif",
    ".tiff",
)


def list_images(base_path, contains=None):
    """Return an iterator over image file paths found under ``base_path``.

    Thin wrapper around ``list_files`` that restricts the search to the
    extensions in ``image_types``; ``contains`` is forwarded unchanged.
    """
    matching_files = list_files(
        base_path,
        valid_exts=image_types,
        contains=contains,
    )
    return matching_files

def list_files(base_path, valid_exts=None, contains=None):
    """Recursively yield file paths under ``base_path`` that pass the filters.

    Args:
        base_path: Directory to walk recursively (via ``os.walk``).
        valid_exts: Optional extension, or iterable of extensions, compared
            against the lower-cased extension of each file (leading dot
            included). ``None`` accepts every file.
        contains: Optional substring that the file name must contain.

    Yields:
        ``os.path.join(root_dir, filename)`` for every matching file.
    """
    # Imported locally so the function does not rely on another notebook
    # cell having imported ``os`` first.
    import os

    # ``str.endswith`` accepts only a str or a tuple; normalise lists/sets so
    # callers may pass any iterable of extensions.
    if valid_exts is not None and not isinstance(valid_exts, str):
        valid_exts = tuple(valid_exts)

    for root_dir, _dir_names, filenames in os.walk(base_path):
        for filename in filenames:
            # skip files whose name lacks the requested substring
            if contains is not None and contains not in filename:
                continue

            # A name without a dot has no extension. Slicing from ``rfind``'s
            # -1 would otherwise treat its last character as the extension.
            dot = filename.rfind(".")
            ext = filename[dot:].lower() if dot != -1 else ""

            if valid_exts is None or ext.endswith(valid_exts):
                yield os.path.join(root_dir, filename)

In [3]:
#utils.py
import matplotlib
matplotlib.use("Agg")

import matplotlib.pyplot as plt
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def eval_model(model, model_name, test_gen, steps):
    """Print a classification report, confusion matrix and accuracy.

    Args:
        model: Object whose ``predict(x=..., steps=...)`` returns one row of
            per-class scores per sample.
        model_name: Label printed in the report header.
        test_gen: Data iterator exposing ``reset()``, ``classes`` (true class
            indices) and ``class_indices`` (name -> index mapping).
            NOTE(review): predictions are compared positionally with
            ``test_gen.classes``, so the iterator is presumably expected to
            be non-shuffled -- confirm at the call site.
        steps: Number of batches to draw from ``test_gen``.

    Returns:
        None. All results are printed.
    """
    # Local import: this cell does not import numpy itself.
    import numpy as np

    print("[INFO] evaluating network...")
    test_gen.reset()
    scores = model.predict(x=test_gen, steps=steps)

    # highest-scoring class per sample
    pred_idxs = np.argmax(scores, axis=1)

    print("CLASSIFICATION REPORT:{}".format(model_name))
    print(
        classification_report(
            test_gen.classes,
            pred_idxs,
            target_names=list(test_gen.class_indices.keys()),
        )
    )

    cm = confusion_matrix(test_gen.classes, pred_idxs)
    # Correct predictions sit on the diagonal. Using the trace keeps the
    # accuracy valid for any number of classes instead of assuming 2x2.
    acc = np.trace(cm) / cm.sum()
    # Per-class recall (sensitivity/specificity) is already part of the
    # classification report printed above.

    print(cm)
    print("acc: {:.4f}".format(acc))


def plot_loss_acc(history, n_epochs, model_name):
    """Plot training/validation loss and accuracy curves on a single figure.

    Args:
        history: Object whose ``history`` attribute maps the keys ``loss``,
            ``val_loss``, ``accuracy`` and ``val_accuracy`` to per-epoch
            sequences.
        n_epochs: Number of points on the x axis (epochs ``0..n_epochs-1``).
        model_name: Text appended to the figure title.
    """
    # (key in history.history, legend label), in drawing order
    curves = (
        ("loss", "train_loss"),
        ("val_loss", "val_loss"),
        ("accuracy", "train_acc"),
        ("val_accuracy", "val_acc"),
    )

    plt.style.use("ggplot")
    plt.figure()
    for metric_key, legend_label in curves:
        plt.plot(
            np.arange(0, n_epochs),
            history.history[metric_key],
            label=legend_label,
        )

    figure_title = "Training Loss and Accuracy on Dataset:{}".format(model_name)
    plt.title(figure_title)
    plt.xlabel("Epoch #")
    plt.ylabel("Loss/Accuracy")
    plt.legend(loc="lower left")
    plt.show()

## EDA and Building dataset

In [43]:
#config.py

# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Root of the original (read-only) dataset.
ORIG_INPUT_DATASET = "/kaggle/input/breast-histopathology-images/"

# New directory used after the train/test split.
BASE_PATH = "/kaggle/working"

# One sub-directory per split below BASE_PATH.
TRAIN_PATH, VAL_PATH, TEST_PATH = (
    os.path.sep.join((BASE_PATH, split_name))
    for split_name in ("train", "val", "test")
)

# Fraction of the data used for training (the rest is the test split).
TRAIN_SPLIT = 0.8
# Fraction of the training data set aside for validation.
VAL_SPLIT = 0.1



In [44]:
#build_dataset.py
import random
import shutil
#import os

SAMPLES=3000


def _paths_to_frame(paths):
    """Return ``(frame, labels)`` for a list of image paths.

    ``labels`` holds, for each path, the name of the directory that directly
    contains the file. ``frame`` is a DataFrame with ``filepath`` and
    ``label`` columns built from the two lists.
    """
    labels = [p.split(os.path.sep)[-2] for p in paths]
    frame = pd.DataFrame({"filepath": list(paths), "label": labels})
    return frame, labels


# select a subset for development
# Sort first: os.walk order depends on the filesystem, so the seed alone
# would not pin down which images end up in the subset.
image_paths = sorted(list_images(ORIG_INPUT_DATASET))
random.seed(42)
# random.sample already returns its draw in random order, so no separate
# shuffle is needed. Capping at the number of available images avoids a
# ValueError when fewer than SAMPLES exist.
image_paths_subset = random.sample(image_paths, min(SAMPLES, len(image_paths)))

# compute the training and testing split
train_index = int(len(image_paths_subset) * TRAIN_SPLIT)
train_paths = image_paths_subset[:train_index]
test_paths = image_paths_subset[train_index:]

# split for validation (taken from the front of the training paths)
val_index = int(len(train_paths) * VAL_SPLIT)
val_paths = train_paths[:val_index]
train_paths = train_paths[val_index:]

# defining datasets: (split name, paths, output directory)
datasets = [
    ("train", train_paths, TRAIN_PATH),
    ("val", val_paths, VAL_PATH),
    ("test", test_paths, TEST_PATH),
]

# method #1: build dataframes with filepaths and parent-directory labels
train_paths_df, train_labels = _paths_to_frame(train_paths)
val_paths_df, val_labels = _paths_to_frame(val_paths)
test_paths_df, test_labels = _paths_to_frame(test_paths)

# method #2 (disabled): copy the files into one folder per split and label.
# Kept as an inert string literal; nothing below is executed.
'''
for (d_type,image_paths,base_output) in datasets:
    print("[INFO] building '{}' split".format(d_type))
    
    if not os.path.exists(base_output):
        print("[INFO] 'creating {}' directory".format(base_output))
        os.makedirs(base_output)
    
    for input_path in image_paths:
        filename=input_path.split(os.path.sep)[-1]
        label=filename[-5:-4]
        
        label_path=os.path.sep.join([base_output,label])
        
        if not os.path.exists(label_path):
            print("[INFO] 'creating {}' directory".format(label_path))
            os.makedirs(label_path)
            
        p=os.path.sep.join([label_path,filename])
        shutil.copy2(input_path,p)
'''

In [45]:
# Display pandas' summary statistics for the training dataframe
# (the `filepath` and `label` columns built above).
train_paths_df.describe()

In [47]:
from skimage import io
import matplotlib.pyplot as plt
%matplotlib inline

# Split the training rows by label ('1' = positive, '0' = negative).
pos_df=train_paths_df[train_paths_df["label"]=='1']
neg_df=train_paths_df[train_paths_df["label"]=='0']

# Load the first image of each class as an example.
img_pos=io.imread(pos_df["filepath"].iloc[0])
img_neg=io.imread(neg_df["filepath"].iloc[0])

# Draw both samples side by side. Matplotlib's own imshow is used because
# skimage.io.imshow is deprecated in recent scikit-image releases.
fig, (ax_pos, ax_neg) = plt.subplots(1, 2)

ax_pos.set_title("Positive Sample")
ax_pos.grid(False)
ax_pos.imshow(img_pos)

ax_neg.set_title("Negative Sample")
ax_neg.grid(False)
ax_neg.imshow(img_neg)

plt.show()

In [40]:
# Class balance of the training split: non-null labels per class.
n_positive = pos_df["label"].count()
print("Number of positive samples in the training set:{}".format(n_positive))
n_negative = neg_df["label"].count()
print("Number of negative samples in the training set:{}".format(n_negative))

In [48]:
#train.py

from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
#from cancer_net import CancerNet

import numpy as np
import argparse
import os

# The argument parser only matters when this code runs from a terminal.
# In the notebook the parsed arguments are replaced by a fixed dict.
ap=argparse.ArgumentParser()
ap.add_argument("-p", "--plot", type=str, default="plot.png",help="path to output loss/accuracy plot")
#args=vars(ap.parse_args())
args={"plot":"/kaggle/temp/plot_loss_acc.png"}

BATCH_SIZE=32

total_train=len(train_paths)
total_val=len(val_paths)
total_test=len(test_paths)

# One-hot encode the training labels straight from the dataframe. The
# original re-used `train_labels` as both input and output, so running the
# cell a second time fed the one-hot array back into int() and failed.
train_labels = to_categorical(train_paths_df["label"].astype(int).to_numpy())
class_totals = train_labels.sum(axis=0)

# Weight each class by (largest class count / its own count), so that
# under-represented classes contribute more to the loss.
class_weights = {
    class_idx: class_totals.max() / class_total
    for class_idx, class_total in enumerate(class_totals)
}

# Label names in sorted order, taken from the training split and shared by
# all three generators so every split uses the same label -> index mapping
# even if one of them happens to miss a class.
label_classes = sorted(train_paths_df["label"].unique())


def _make_generator(augmenter, frame, shuffle, classes, batch_size):
    """Return a 48x48 RGB categorical batch iterator over ``frame``.

    ``frame`` must provide ``filepath`` and ``label`` columns; ``classes``
    fixes the label order used for the one-hot targets.
    """
    return augmenter.flow_from_dataframe(
        frame,
        directory=None,
        x_col='filepath', y_col='label',
        classes=classes,
        class_mode="categorical",
        target_size=(48, 48),
        color_mode="rgb",
        shuffle=shuffle,
        batch_size=batch_size,
    )


# Building the generators to load images dynamically.
# Baseline run: only rescaling to [0, 1]. Geometric augmentation (rotation,
# zoom, shifts, shear, flips) is intentionally left disabled here.
train_aug=ImageDataGenerator(rescale=1/255.0)

val_aug=ImageDataGenerator(rescale=1/255.0)

# Only the training generator shuffles. Validation and test keep dataframe
# order so predictions can be matched to the generator's `classes`.
train_gen = _make_generator(train_aug, train_paths_df, True, label_classes, BATCH_SIZE)
val_gen = _make_generator(val_aug, val_paths_df, False, label_classes, BATCH_SIZE)
test_gen = _make_generator(val_aug, test_paths_df, False, label_classes, BATCH_SIZE)


## Model building: MODEL #1

### Custom model

In [49]:
#cancer_net.py

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import BatchNormalization, SeparableConv2D, MaxPooling2D, Activation
from tensorflow.keras.layers import Flatten, Dropout, Dense
from tensorflow.keras import backend as K


class CancerNet:
    """Builder for a small CNN made of depthwise-separable convolutions."""

    @staticmethod
    def build(width, height, depth, classes):
        """Assemble the (uncompiled) Sequential network.

        Args:
            width: Input image width in pixels.
            height: Input image height in pixels.
            depth: Number of input channels.
            classes: Number of units in the final softmax layer.

        Returns:
            A ``Sequential`` model; compiling it is left to the caller.
        """
        # Default to channels-last; switch both the input shape and the
        # batch-normalisation axis when the backend is channels-first.
        input_shape = (height, width, depth)
        channels_dim = -1
        if K.image_data_format() == "channels_first":
            input_shape = (depth, height, width)
            channels_dim = 1

        model = Sequential()

        # Three feature stages, each (SEPCONV => RELU => BN) * n => POOL =>
        # DROPOUT, with n = 1, 2, 3 and 32, 64, 128 filters respectively.
        is_first_layer = True
        for filters, n_convs in ((32, 1), (64, 2), (128, 3)):
            for _ in range(n_convs):
                # Only the model's first layer declares the input shape.
                # The original passed it to every conv layer, where it does
                # not describe that layer's real input.
                shape_kwargs = {"input_shape": input_shape} if is_first_layer else {}
                model.add(SeparableConv2D(filters, (3, 3), padding="same", **shape_kwargs))
                is_first_layer = False
                model.add(Activation("relu"))
                model.add(BatchNormalization(axis=channels_dim))
            model.add(MaxPooling2D(pool_size=(2, 2)))
            model.add(Dropout(0.25))

        # FC head
        model.add(Flatten())
        model.add(Dense(256))
        model.add(Activation("relu"))
        model.add(BatchNormalization())
        model.add(Dropout(0.5))

        # softmax classifier
        model.add(Dense(classes))
        model.add(Activation("softmax"))

        return model
        

## Experiments:
* Optimizers: Adagrad, Adam, SGD

- **Adagrad**: With `INIT_LR=0.01` (the value set in the code below), I'll run some tests to find the limits of the optimizer's update strategy.

- **Adam**: With `INIT_LR=0.01` (the value set in the code below), I'll check whether it has a generalization problem with this dataset. It may improve with data augmentation.

- **NAG**: Baseline optimizer, just for comparative purposes.

### Adagrad
* INIT_LR: 0.01 (`1e-2`, as set in the cell below)

In [50]:
#train.py
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adagrad

INIT_LR=1e-2
NUM_EPOCHS=80

# building model #1 adagrad
print("[INFO] Building and compiling the model...")
model_1_adagrad=CancerNet.build(width=48,height=48,depth=3,classes=2)

# `lr=` and `decay=` are legacy optimizer arguments that newer Keras
# releases no longer accept. The schedule below reproduces the legacy
# time-based decay, lr_t = INIT_LR / (1 + decay * step), with
# decay = INIT_LR / NUM_EPOCHS applied per optimizer step.
from tensorflow.keras.optimizers.schedules import InverseTimeDecay

lr_schedule_adagrad = InverseTimeDecay(
    initial_learning_rate=INIT_LR,
    decay_steps=1,
    decay_rate=INIT_LR / NUM_EPOCHS,
)
opt_adagrad = Adagrad(learning_rate=lr_schedule_adagrad)
model_1_adagrad.compile(loss="binary_crossentropy",optimizer=opt_adagrad,metrics=["accuracy"])

In [51]:
#train.py
print("[INFO] Starting the training session...")

# Keep a single file holding the epoch with the best validation accuracy
# (no {epoch:02d}.{val_loss:.2f} pattern in the file name).
checkpoint=ModelCheckpoint(
    filepath='/kaggle/working/weights.model_1_adagrad.best.hdf5',
    monitor="val_accuracy",
    save_best_only=True
)

h_1_adagrad=model_1_adagrad.fit(
    x=train_gen,
    steps_per_epoch=total_train//BATCH_SIZE,
    validation_data=val_gen,
    # Round up so the final partial batch is validated too. Floor division
    # left those samples out of every validation pass (and yields 0 steps
    # when the validation set is smaller than one batch).
    validation_steps=(total_val+BATCH_SIZE-1)//BATCH_SIZE,
    class_weight=class_weights,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint]
)

### Evaluation:
Evaluating model with Adagrad optimizer.

In [52]:
#eval.py

# loading best model
# Rebuild model #1 (adagrad) from scratch and load the checkpoint file
# written during training, so evaluation uses the saved weights rather
# than whatever the in-memory model held after the last epoch.
print("[INFO] Building and compiling the model...")
model_1_adagrad=CancerNet.build(width=48,height=48,depth=3,classes=2)
model_1_adagrad.load_weights('/kaggle/working/weights.model_1_adagrad.best.hdf5')
# NOTE(review): this reuses the optimizer instance created for training;
# presumably harmless because the model is only used for prediction -- confirm.
model_1_adagrad.compile(loss="binary_crossentropy",optimizer=opt_adagrad,metrics=["accuracy"])

In [53]:
#eval.py
# model #1 adagrad
# Ceiling division gives exactly enough batches to cover the test set once.
# `total_test//BATCH_SIZE + 1` asks for one batch too many whenever
# total_test is a multiple of BATCH_SIZE.
eval_model(model_1_adagrad, model_name="model_1_adagrad", test_gen=test_gen, steps=(total_test+BATCH_SIZE-1)//BATCH_SIZE)

In [54]:
#eval.py
%matplotlib inline
    
# model #1 adagrad: loss/accuracy curves of the training run.
# NUM_EPOCHS is a notebook-level variable that other training cells also
# assign; it has to match the number of epochs recorded in h_1_adagrad.
plot_loss_acc(h_1_adagrad, n_epochs=NUM_EPOCHS,model_name='model_1_adagrad')

## Adam
* INIT_LR: 0.01 (`1e-2`, as set in the cell below)

In [55]:
#train.py
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam

INIT_LR=1e-2
NUM_EPOCHS=70

# building model #1 adam
print("[INFO] Building and compiling the model...")
model_1_adam=CancerNet.build(width=48,height=48,depth=3,classes=2)
# `learning_rate` is the supported argument name; `lr` is a legacy alias
# that newer Keras releases no longer accept.
opt_adam=Adam(learning_rate=INIT_LR)
model_1_adam.compile(loss="binary_crossentropy",optimizer=opt_adam,metrics=["accuracy"])

In [56]:
#train.py
print("[INFO] Starting the training session...")

# The file is named "...best.hdf5" and is later loaded as the best model, so
# only overwrite it when val_accuracy improves. Without save_best_only the
# file simply holds the last epoch.
checkpoint=ModelCheckpoint(
    filepath='/kaggle/working/weights.model_1_adam.best.hdf5',
    monitor="val_accuracy",
    save_best_only=True
)

h_1_adam=model_1_adam.fit(
    x=train_gen,
    steps_per_epoch=total_train//BATCH_SIZE,
    validation_data=val_gen,
    # Round up so the final partial batch is validated too. Floor division
    # left those samples out of every validation pass.
    validation_steps=(total_val+BATCH_SIZE-1)//BATCH_SIZE,
    class_weight=class_weights,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint]
)

### Evaluation:
Evaluating model with Adam optimizer.

In [57]:
#eval.py

# loading best model
# Rebuild model #1 (adam) from scratch and load the checkpoint file written
# during training, so evaluation uses the saved weights.
# INIT_LR is re-assigned here but not read anywhere in this cell.
INIT_LR=1e-2
print("[INFO] Building and compiling the model...")
model_1_adam=CancerNet.build(width=48,height=48,depth=3,classes=2)
model_1_adam.load_weights('/kaggle/working/weights.model_1_adam.best.hdf5')
# NOTE(review): this reuses the optimizer instance created for training;
# presumably harmless because the model is only used for prediction -- confirm.
model_1_adam.compile(loss="binary_crossentropy",optimizer=opt_adam,metrics=["accuracy"])

In [58]:
#eval.py

# model #1 adam
# Ceiling division gives exactly enough batches to cover the test set once.
# `total_test//BATCH_SIZE + 1` asks for one batch too many whenever
# total_test is a multiple of BATCH_SIZE.
eval_model(model_1_adam, model_name="model_1_adam", test_gen=test_gen, steps=(total_test+BATCH_SIZE-1)//BATCH_SIZE)

In [59]:
#eval.py
%matplotlib inline
    
# model #1 adam: loss/accuracy curves of the training run.
# NUM_EPOCHS is a notebook-level variable that other training cells also
# assign; it has to match the number of epochs recorded in h_1_adam.
plot_loss_acc(h_1_adam, n_epochs=NUM_EPOCHS,model_name='model_1_adam')

## NAG
- INIT_LR=0.01 (`1e-2`, as set in the cell below)

In [60]:
#train.py
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import SGD

INIT_LR=1e-2
NUM_EPOCHS=60

# building model #1 nag
print("[INFO] Building and compiling the model...")
model_1_nag=CancerNet.build(width=48,height=48,depth=3,classes=2)
# Nesterov acceleration only takes effect with a non-zero momentum. With
# the default momentum of 0 this optimizer was plain SGD, not NAG.
# NOTE(review): 0.9 is the conventional momentum value -- tune as needed.
# `learning_rate` replaces the legacy `lr` alias.
opt_nag=SGD(learning_rate=INIT_LR, momentum=0.9, nesterov=True)
model_1_nag.compile(loss="binary_crossentropy",optimizer=opt_nag,metrics=["accuracy"])

In [61]:
#train.py
print("[INFO] Starting the training session...")

# The file is named "...best.hdf5" and is later loaded as the best model, so
# only overwrite it when val_accuracy improves. Without save_best_only the
# file simply holds the last epoch.
checkpoint=ModelCheckpoint(
    filepath='/kaggle/working/weights.model_1_nag.best.hdf5',
    monitor="val_accuracy",
    save_best_only=True
)

h_1_nag=model_1_nag.fit(
    x=train_gen,
    steps_per_epoch=total_train//BATCH_SIZE,
    validation_data=val_gen,
    # Round up so the final partial batch is validated too. Floor division
    # left those samples out of every validation pass.
    validation_steps=(total_val+BATCH_SIZE-1)//BATCH_SIZE,
    class_weight=class_weights,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint]
)

## Evaluation:
Evaluating model with NAG optimizer.

In [62]:
#eval.py

# loading best model
# Rebuild model #1 (nag) from scratch and load the checkpoint file written
# during training, so evaluation uses the saved weights.
# INIT_LR is re-assigned here but not read anywhere in this cell.
INIT_LR=1e-2
print("[INFO] Building and compiling the model...")
model_1_nag=CancerNet.build(width=48,height=48,depth=3,classes=2)
model_1_nag.load_weights('/kaggle/working/weights.model_1_nag.best.hdf5')
# NOTE(review): this reuses the optimizer instance created for training;
# presumably harmless because the model is only used for prediction -- confirm.
model_1_nag.compile(loss="binary_crossentropy",optimizer=opt_nag,metrics=["accuracy"])

In [63]:
#eval.py

# model #1 nag
# Ceiling division gives exactly enough batches to cover the test set once.
# `total_test//BATCH_SIZE + 1` asks for one batch too many whenever
# total_test is a multiple of BATCH_SIZE.
eval_model(model_1_nag, model_name="model_1_nag", test_gen=test_gen, steps=(total_test+BATCH_SIZE-1)//BATCH_SIZE)

In [64]:
#eval.py
%matplotlib inline
    
# model #1 nag: loss/accuracy curves of the training run.
# NUM_EPOCHS is a notebook-level variable that other training cells also
# assign; it has to match the number of epochs recorded in h_1_nag.
plot_loss_acc(h_1_nag, n_epochs=NUM_EPOCHS,model_name='model_1_nag')

## It's time to feed one model with more data

I'll try with data augmentation and more data.

In [65]:
#build_dataset.py
import random
import shutil
#import os

SAMPLES=30000


def _paths_to_frame(paths):
    """Return ``(frame, labels)`` for a list of image paths.

    ``labels`` holds, for each path, the name of the directory that directly
    contains the file. ``frame`` is a DataFrame with ``filepath`` and
    ``label`` columns built from the two lists.
    """
    labels = [p.split(os.path.sep)[-2] for p in paths]
    frame = pd.DataFrame({"filepath": list(paths), "label": labels})
    return frame, labels


# select a larger subset than in the first experiments
# Sort first: os.walk order depends on the filesystem, so the seed alone
# would not pin down which images end up in the subset.
image_paths = sorted(list_images(ORIG_INPUT_DATASET))
random.seed(42)
# random.sample already returns its draw in random order, so no separate
# shuffle is needed. Capping at the number of available images avoids a
# ValueError when fewer than SAMPLES exist.
image_paths_subset = random.sample(image_paths, min(SAMPLES, len(image_paths)))

# compute the training and testing split
train_index = int(len(image_paths_subset) * TRAIN_SPLIT)
train_paths = image_paths_subset[:train_index]
test_paths = image_paths_subset[train_index:]

# split for validation (taken from the front of the training paths)
val_index = int(len(train_paths) * VAL_SPLIT)
val_paths = train_paths[:val_index]
train_paths = train_paths[val_index:]

# defining datasets: (split name, paths, output directory)
datasets = [
    ("train", train_paths, TRAIN_PATH),
    ("val", val_paths, VAL_PATH),
    ("test", test_paths, TEST_PATH),
]

# method #1: build dataframes with filepaths and parent-directory labels
train_paths_df, train_labels = _paths_to_frame(train_paths)
val_paths_df, val_labels = _paths_to_frame(val_paths)
test_paths_df, test_labels = _paths_to_frame(test_paths)

In [66]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import to_categorical
#from cancer_net import CancerNet

import numpy as np
import argparse
import os

# The argument parser only matters when this code runs from a terminal.
# In the notebook the parsed arguments are replaced by a fixed dict.
ap=argparse.ArgumentParser()
ap.add_argument("-p", "--plot", type=str, default="plot.png",help="path to output loss/accuracy plot")
#args=vars(ap.parse_args())
args={"plot":"/kaggle/temp/plot_loss_acc.png"}

BATCH_SIZE=32

total_train=len(train_paths)
total_val=len(val_paths)
total_test=len(test_paths)

# One-hot encode the training labels straight from the dataframe. The
# original re-used `train_labels` as both input and output, so running the
# cell a second time fed the one-hot array back into int() and failed.
train_labels = to_categorical(train_paths_df["label"].astype(int).to_numpy())
class_totals = train_labels.sum(axis=0)

# Weight each class by (largest class count / its own count), so that
# under-represented classes contribute more to the loss.
class_weights = {
    class_idx: class_totals.max() / class_total
    for class_idx, class_total in enumerate(class_totals)
}

# Label names in sorted order, taken from the training split and shared by
# all three generators so every split uses the same label -> index mapping
# even if one of them happens to miss a class.
label_classes = sorted(train_paths_df["label"].unique())


def _make_generator(augmenter, frame, shuffle, classes, batch_size):
    """Return a 48x48 RGB categorical batch iterator over ``frame``.

    ``frame`` must provide ``filepath`` and ``label`` columns; ``classes``
    fixes the label order used for the one-hot targets.
    """
    return augmenter.flow_from_dataframe(
        frame,
        directory=None,
        x_col='filepath', y_col='label',
        classes=classes,
        class_mode="categorical",
        target_size=(48, 48),
        color_mode="rgb",
        shuffle=shuffle,
        batch_size=batch_size,
    )


# Building the generators to load images dynamically.
# Training images are rescaled to [0, 1] and randomly rotated, zoomed,
# shifted and flipped; shear stays disabled.
train_aug=ImageDataGenerator(
    rescale=1/255.0,
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.1,
    height_shift_range=0.1,
    #shear_range=0.05,
    horizontal_flip=True,
    vertical_flip=True,
    fill_mode="nearest"
)

# Validation and test images are only rescaled.
val_aug=ImageDataGenerator(rescale=1/255.0)

# Only the training generator shuffles. Validation and test keep dataframe
# order so predictions can be matched to the generator's `classes`.
train_gen = _make_generator(train_aug, train_paths_df, True, label_classes, BATCH_SIZE)
val_gen = _make_generator(val_aug, val_paths_df, False, label_classes, BATCH_SIZE)
test_gen = _make_generator(val_aug, test_paths_df, False, label_classes, BATCH_SIZE)

### Model: MODEL #1 Adagrad

In [67]:
#train.py
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adagrad

INIT_LR=1e-2
NUM_EPOCHS=80

# building model #1 adagrad (to be retrained on the larger, augmented data)
print("[INFO] Building and compiling the model...")
model_1_adagrad=CancerNet.build(width=48,height=48,depth=3,classes=2)

# `lr=` and `decay=` are legacy optimizer arguments that newer Keras
# releases no longer accept. The schedule below reproduces the legacy
# time-based decay, lr_t = INIT_LR / (1 + decay * step), with
# decay = INIT_LR / NUM_EPOCHS applied per optimizer step.
from tensorflow.keras.optimizers.schedules import InverseTimeDecay

lr_schedule_adagrad = InverseTimeDecay(
    initial_learning_rate=INIT_LR,
    decay_steps=1,
    decay_rate=INIT_LR / NUM_EPOCHS,
)
opt_adagrad = Adagrad(learning_rate=lr_schedule_adagrad)

# NOTE(review): the starting weights come from the NAG checkpoint although
# this model is named "adagrad" -- confirm this is intended and not a
# copy/paste of the wrong file name.
model_1_adagrad.load_weights('/kaggle/working/weights.model_1_nag.best.hdf5')
model_1_adagrad.compile(loss="binary_crossentropy",optimizer=opt_adagrad,metrics=["accuracy"])


In [68]:
#train.py
print("[INFO] Starting the training session...")

# Keep a single file holding the epoch with the best validation accuracy
# (no {epoch:02d}.{val_loss:.2f} pattern in the file name).
checkpoint=ModelCheckpoint(
    filepath='/kaggle/working/weights.model_1_adagrad.retrained.hdf5',
    monitor="val_accuracy",
    save_best_only=True
)

h_1_adagrad=model_1_adagrad.fit(
    x=train_gen,
    steps_per_epoch=total_train//BATCH_SIZE,
    validation_data=val_gen,
    # Round up so the final partial batch is validated too. Floor division
    # left those samples out of every validation pass.
    validation_steps=(total_val+BATCH_SIZE-1)//BATCH_SIZE,
    # class weighting is intentionally left disabled for this run
    #class_weight=class_weights,
    epochs=NUM_EPOCHS,
    callbacks=[checkpoint]
)

### Evaluation

In [69]:
#eval.py

# loading best model
# Rebuild model #1 (adagrad) from scratch and load the checkpoint file
# written during retraining, so evaluation uses the saved weights.
# INIT_LR is re-assigned here but not read anywhere in this cell.
INIT_LR=1e-2
print("[INFO] Building and compiling the model...")
model_1_adagrad=CancerNet.build(width=48,height=48,depth=3,classes=2)
model_1_adagrad.load_weights('/kaggle/working/weights.model_1_adagrad.retrained.hdf5')
# NOTE(review): this reuses the optimizer instance created for training;
# presumably harmless because the model is only used for prediction -- confirm.
model_1_adagrad.compile(loss="binary_crossentropy",optimizer=opt_adagrad,metrics=["accuracy"])

In [70]:
#eval.py

# model #1 adagrad (retrained)
# Ceiling division gives exactly enough batches to cover the test set once.
# `total_test//BATCH_SIZE + 1` asks for one batch too many whenever
# total_test is a multiple of BATCH_SIZE.
eval_model(model_1_adagrad, model_name="model_1_adagrad", test_gen=test_gen, steps=(total_test+BATCH_SIZE-1)//BATCH_SIZE)

* The model has a strong eye for detecting true negatives.
* Recall on the positive (IDC) class is still not high enough: the model needs more sensitivity so that patients who do have IDC are not missed.

In [71]:
#eval.py
%matplotlib inline
    
# model #1 adagrad (retrained): loss/accuracy curves of the training run.
# NUM_EPOCHS is a notebook-level variable that other training cells also
# assign; it has to match the number of epochs recorded in h_1_adagrad.
plot_loss_acc(h_1_adagrad, n_epochs=NUM_EPOCHS,model_name='model_1_adagrad')

Feeding the model with augmented data seems to improve its performance.