## Breast Cancer Prediction and Detection
### via Machine Learning and Deep Learning  

### David Kinney - DSC680 - Spring 2021 - Professor Catherine Williams
**********************************************************************

In [None]:
# Import packages
import pandas as pd
from pycaret.classification import *

## Breast Cancer Prediction - Machine Learning

In [None]:
# Read the Breast Cancer Wisconsin (Diagnostic) dataset
df = pd.read_csv('./data/data.csv')
# Show (rows, columns) as the cell output to confirm what was loaded
df.shape

### Exploratory Data Analysis

In [None]:
# Summary statistics, transposed so that each described column becomes a row
df.describe().T

In [None]:
# Print the column overview of the frame (names, dtypes, non-null counts)
df.info()

In [None]:
# Remove the 'Unnamed: 32' column in place.
# errors='ignore' keeps the cell re-runnable: once the column is gone, a second
# execution would otherwise raise KeyError.
df.drop(columns=['Unnamed: 32'], inplace=True, errors='ignore')

***********************************************
### Machine Learning Model Selection, Training and Tuning  

#### Initialize PyCaret

In [None]:
# Initialize the PyCaret classification experiment on df with 'diagnosis' as the label.
# session_id pins the random seed; log_experiment/experiment_name switch on
# experiment logging under the name 'BCpred'.
# NOTE(review): every remaining column of df is offered as a feature. If df carries a
# record identifier column (e.g. 'id'), consider excluding it — verify against df.columns.
clf1 = setup(df, target = 'diagnosis', 
             session_id=8784, 
             log_experiment=True, 
             experiment_name='BCpred')

#### Compare Baseline

In [None]:
# Train and score the candidate classifiers and keep the top-ranked one.
# NOTE(review): best_model is not referenced again in the visible notebook — the
# next cell hard-codes 'xgboost'.
best_model = compare_models()

**Create best model from baseline results**

In [None]:
# Build the XGBoost classifier; gpu_id=1 is passed along as an extra keyword argument.
# NOTE(review): gpu_id=1 presumably selects the second GPU — confirm the target
# machine has more than one device.
model = create_model('xgboost', gpu_id=1)

**Tune hyperparameters**

In [None]:
# Search for better hyperparameters for the xgboost model.
# NOTE(review): the plotting and prediction cells that follow use `model`, not
# `tuned_model` — confirm that evaluating the untuned model is intended.
tuned_model = tune_model(model)

#### Plot Results

In [None]:
# Draw PyCaret's default diagnostic plot for the model (no plot type given)
plot_model(model)

In [None]:
# Confusion matrix of the model's predictions
plot_model(model, plot = 'confusion_matrix')

In [None]:
# Decision-boundary plot
plot_model(model, plot = 'boundary')

In [None]:
# Feature-importance plot
plot_model(model, plot = 'feature')

In [None]:
# Precision-recall curve
plot_model(model, plot = 'pr')

In [None]:
# Per-class classification report
plot_model(model, plot = 'class_report')

In [None]:
# Default model-interpretation plot (no plot type given)
interpret_model(model)

In [None]:
# 'correlation' variant of the model-interpretation plot
interpret_model(model, plot = 'correlation')

### Model Prediction

In [None]:
# Score the model; no data argument is given, so PyCaret falls back to the
# hold-out portion it set aside during setup()
pred_holdouts = predict_model(model)
# Preview the first rows of the returned prediction frame
pred_holdouts.head()

************************************
## Breast Cancer Detection via Deep Learning  

In [None]:
# Import packages
import os

import pandas as pd
import numpy as np

# from fastai import *
# from fastai.data.transforms import get_files, Path
# from fastai.metrics import error_rate
from fastai.vision.transform import get_transforms
from fastai import *
from fastai.vision import *
from fastai.metrics import error_rate
# GPUMemTrace is part of fastai's memory utilities (fastai.utils.mem);
# importing it from 'fast.utils.mem' fails because 'fast' is not the fastai package.
from fastai.utils.mem import GPUMemTrace

### Exploratory Data Analysis

In [None]:
# The image patches are .png files stored under ./data/images. The class label is
# read from the file name: `pattern` captures the last token before the extension
# that contains neither '/' nor '_' (e.g. 'class0' in '..._class0.png').
x  = "./data/images"
path = Path(x)
# The dot is escaped so that only a literal '.png' extension matches
pattern = r'([^/_]+)\.png$'
fnames = get_files(path, recurse=True)
# Augmentation: flips (including vertical) only — warp, zoom and rotation are disabled
tfms = get_transforms(flip_vert=True, max_warp=0., max_zoom=0., max_rotate=0.)
# path.ls()

**Load training data**

In [None]:
# Seed NumPy's RNG before the DataBunch is built
# (presumably so the train/validation split is repeatable — confirm)
np.random.seed(61)
# Label each file by applying `pattern` to its name, apply the transforms in `tfms`,
# resize to 50 px, batch 64 images at a time, then normalize
data = ImageDataBunch.from_name_re(path, fnames, pattern, ds_tfms=tfms, size=50, bs=64,num_workers=4
                                  ).normalize()

In [None]:
# Display the DataBunch summary as the cell output
data

In [None]:
# Class 0 = benign, class 1 = malignant
# Show a 3-row sample grid of training images with their labels.
# NOTE(review): recompute_scale_factor is passed through as an extra keyword —
# confirm show_batch accepts it in the installed fastai version.
data.show_batch(rows=3, figsize=(7,6),recompute_scale_factor=True)

In [None]:
# Count the image patches of each class across all patient folders.
# Expected layout: <x>/<patient_id>/0/... (class 0) and <x>/<patient_id>/1/... (class 1).


def _count_entries(folder):
    """Return the number of directory entries in folder, or 0 if it is not a directory."""
    return len(os.listdir(folder)) if os.path.isdir(folder) else 0


# Paths are rooted at the image directory `x` and built with os.path.join, so the
# cell depends on nothing but `os` and `x`.
patient_ids = os.listdir(x)
class_0_total = 0
class_1_total = 0

for patient_id in patient_ids:
    class_0_total += _count_entries(os.path.join(x, patient_id, '0'))
    class_1_total += _count_entries(os.path.join(x, patient_id, '1'))

total_images = class_0_total + class_1_total

print(f'Number of patches in Class 0: {class_0_total}')
print(f'Number of patches in Class 1: {class_1_total}')
print(f'Total number of patches: {total_images}')

In [None]:
# Imported in this cell so the styling works when the notebook is run top to bottom;
# neither name is bound by the import cells that precede it.
import matplotlib
import seaborn as sns

# Global plot styling: seaborn theme first, then matplotlib figure size and font defaults
sns.set(context='notebook', style='darkgrid', palette='colorblind', font='sans-serif', font_scale=1, rc=None)
matplotlib.rcParams['figure.figsize'] =[8,8]
matplotlib.rcParams.update({'font.size': 15})
matplotlib.rcParams['font.family'] = 'sans-serif'

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import seaborn as sns

# `data` is the fastai ImageDataBunch at this point and has no groupby(), so the
# per-patient statistics are computed from a table built out of the file list.
# Assumes the layout <root>/<patient_id>/<0|1>/<file> that the patch-counting cell
# relies on — TODO confirm. Files outside a '0' or '1' folder are ignored.
labelled_files = [Path(f) for f in fnames if Path(f).parent.name in ("0", "1")]
patches_df = pd.DataFrame({
    "patient_id": [p.parent.parent.name for p in labelled_files],
    "target": [int(p.parent.name) for p in labelled_files],
})

by_patient = patches_df.groupby("patient_id")
# Share of each class among a patient's patches; a class a patient lacks counts as 0
cancer_perc = by_patient.target.value_counts() / by_patient.target.size()
cancer_perc = cancer_perc.unstack().fillna(0)

fig, ax = plt.subplots(1,3,figsize=(25,5))

# Plotting Frequency of Patches per Patient
sns.distplot(by_patient.size(), ax=ax[0], color="green", kde=False, bins=20)
ax[0].set_xlabel("Number of patches")
ax[0].set_ylabel("Frequency")
ax[0].set_title("How many patches do we have per patient?")

# Plotting Percentage of an image that is covered by Invasive Ductal Carcinoma
sns.distplot(cancer_perc.loc[:, 1]*100, ax=ax[1], color="red", kde=False, bins=20)
ax[1].set_title("How much percentage of an image is covered by IDC?")
ax[1].set_ylabel("Frequency")
ax[1].set_xlabel("% of patches with IDC")

# Plotting number of patches that show IDC
# (x= is passed by keyword so the Series is not mistaken for the `data` argument)
sns.countplot(x=patches_df.target, palette='pastel', ax=ax[2]);
ax[2].set_ylabel("Count")
ax[2].set_xlabel("no(0) versus yes(1)")
ax[2].set_title("How many patches show IDC?");

### Create Model

We now use a pre-trained `ResNet18 Convolutional Neural Net` model, and use *transfer learning* to learn weights of only the last layer of the network.   
Why Transfer learning? Because with transfer learning, you begin with an existing (trained) neural network used for image recognition — and then tweak it a bit (or more) here and there to train a model for your particular use case. And why do we do that? Training a reasonable neural network would mean needing approximately 300,000 image samples, and to achieve really good performance, we’re going to need at least a million images.  
In our case, we have roughly 4000 images in our training set — you get one guess as to whether that would have been enough if we were to train a neural net from scratch.  
We use the `cnn_learner()` function (called `create_cnn()` in older fastai releases) to load a pre-trained ResNet18 network that was trained on around a million images from the ImageNet database.

In [None]:
# Build the transfer-learning learner: ResNet18 architecture on `data`, reporting
# accuracy; model files are written under ./data/working.
learn = cnn_learner(data, models.resnet18, 
                    metrics=[accuracy], 
                    model_dir = Path('./data/working'),
                    path = Path("."))
# Move the learner's own network to the GPU. Assigning the global `model` here
# would replace the ResNet18 that cnn_learner just built with an unrelated object.
learn.model = learn.model.cuda()

In [None]:
# calculate and plot the learning rate
# lr_find() runs the learning-rate search; the recorder then plots the recorded
# losses and, with suggestions=True, marks a suggested learning rate
learn.lr_find()
learn.recorder.plot(suggestions=True)

### Train and tune a model

In [None]:
# First training pass: one epoch of one-cycle training with learning rates
# spread between lr1 and lr2.
lr1 = 1e-3
lr2 = 1e-1
# GPUMemTrace has to be instantiated — the bare class is not a context manager
with GPUMemTrace():
    learn.fit_one_cycle(1,slice(lr1,lr2))

In [None]:
# Second training pass: one more epoch, this time with only an upper learning rate.
# lr1 = 1e-3
lr = 1e-1
# GPUMemTrace has to be instantiated — the bare class is not a context manager
with GPUMemTrace():
    learn.fit_one_cycle(1,slice(lr))

In [None]:
# Hyperparameter tuning
# Unfreeze all layers, rerun the learning-rate search on the unfrozen network,
# then train one epoch with learning rates between 1e-4 and 1e-3.
learn.unfreeze()
learn.lr_find()
learn.recorder.plot()
# GPUMemTrace has to be instantiated — the bare class is not a context manager
with GPUMemTrace():
    learn.fit_one_cycle(1,slice(1e-4,1e-3))

#### Plot a confusion matrix

In [None]:
# Build an interpretation object from the trained learner and plot its confusion matrix
interp = ClassificationInterpretation.from_learner(learn)
interp.plot_confusion_matrix()

************************************************

In [None]:
# Libraries
import os

import fnmatch
from glob import glob
import numpy as np
import pandas as pd
import cv2
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader, Dataset

In [None]:
# Recursively collect the path of every .png file under ./data/images
imagePatches = glob('./data/images/**/*.png', recursive=True)
# Number of image files found
len(imagePatches)

In [None]:
# Shell-style patterns that identify the class from the end of the file name
patternZero, patternOne = '*class0.png', '*class1.png'
# Split the collected paths into the files of each class
classZero = [patch for patch in imagePatches if fnmatch.fnmatch(patch, patternZero)]
classOne = [patch for patch in imagePatches if fnmatch.fnmatch(patch, patternOne)]

In [None]:
# Scratch arithmetic: half of 555048.
# NOTE(review): presumably relates to the image count printed above — confirm or remove this cell.
555048/2

In [None]:
# Label every patch: 0 for class-0 files, 1 for class-1 files.
# Membership is tested against sets; testing each path against the lists would
# cost time proportional to (number of patches) squared.
zero_paths = set(classZero)
one_paths = set(classOne)

# Paths and labels are built together so the two columns always have the same
# length; a file that matches neither class pattern is left out.
labelled = [(img, 0 if img in zero_paths else 1)
            for img in imagePatches
            if img in zero_paths or img in one_paths]
y = [label for _, label in labelled]

images_df = pd.DataFrame(labelled, columns=["images", "labels"])
images_df.head()

In [None]:
# Number of images per label
images_df.groupby("labels")["labels"].count()

In [None]:
#Splitting data into train and val
# 20% of the rows go to validation; stratify keeps the label proportions equal in both parts.
# NOTE(review): no random_state is passed, so the split can differ between runs —
# confirm whether reproducibility matters here.
train, val = train_test_split(images_df, stratify=images_df.labels, test_size=0.2)
# Sizes of the two parts
len(train), len(val)

In [None]:
class MyDataset(Dataset):
    """Dataset that loads image patches from disk given (path, label) rows.

    Args:
        df_data: DataFrame whose rows unpack as ``(image_path, label)``.
        transform: Optional callable applied to the resized image array.
    """

    def __init__(self, df_data,transform=None):
        super().__init__()
        # Keep the rows as a plain array so items are looked up by position
        self.df = df_data.values
        self.transform = transform

    def __len__(self):
        """Return the number of (path, label) rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Load the image at ``index``, resize it to 50x50 and apply the transform.

        Returns:
            tuple: ``(image, label)``.

        Raises:
            FileNotFoundError: If the image file cannot be read.
        """
        img_path, label = self.df[index]

        # cv2.imread signals a missing or unreadable file by returning None rather
        # than raising, so fail here with the offending path instead of inside resize.
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")

        image = cv2.resize(image, (50, 50))
        if self.transform is not None:
            image = self.transform(image)
        return image, label

In [None]:
## Parameters for model

# Training hyper-parameters
num_epochs, num_classes = 10, 2
batch_size = 128
learning_rate = 0.002

# Run on the first CUDA device when one is available, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [None]:
# Training pipeline: array -> PIL image, reflect-pad 64 px on every side, random
# flips and a random rotation of up to 20 degrees, then tensor conversion and
# per-channel normalization with mean 0.5 / std 0.5.
trans_train = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Pad(64, padding_mode='reflect'),
    transforms.RandomHorizontalFlip(),
    transforms.RandomVerticalFlip(),
    transforms.RandomRotation(20),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Validation pipeline: same padding and normalization, no random augmentation.
trans_valid = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Pad(64, padding_mode='reflect'),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Wrap both splits in datasets that apply the matching pipeline
dataset_train = MyDataset(train, transform=trans_train)
dataset_valid = MyDataset(val, transform=trans_valid)

# Training batches are shuffled; validation uses half the batch size and a fixed order
loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, num_workers=0)
loader_valid = DataLoader(dataset_valid, batch_size=batch_size // 2, shuffle=False, num_workers=0)

In [None]:
class SimpleCNN(nn.Module):
    """Five-stage convolutional classifier for 3-channel images.

    Every stage is Conv2d (kernel 3, padding 2) -> BatchNorm2d -> leaky ReLU ->
    2x2 max-pool, widening the channels 3 -> 32 -> 64 -> 128 -> 256 -> 512. The
    last feature map is averaged down to 1x1 and passed to one linear layer.

    Args:
        num_classes: Number of output logits (default 2).
    """

    def __init__(self, num_classes=2):
        # ancestor constructor call
        super(SimpleCNN, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=2)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=2)
        self.conv4 = nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=2)
        self.conv5 = nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, padding=2)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm2d(256)
        self.bn5 = nn.BatchNorm2d(512)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        # Global average pooling. A fixed AvgPool2d(8) needs the last feature map to
        # be at least 8x8; a 50x50 patch padded by 64 px per side (178x178) arrives
        # here as 7x7 and would raise. Adaptive pooling returns the same values on
        # an 8x8 map and accepts every other size as well.
        self.avg = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(512 * 1 * 1, num_classes)

    def forward(self, x):
        """Return the class logits, one row per input image."""
        # each stage: convolution, then batchnorm, then activation, then pooling
        x = self.pool(F.leaky_relu(self.bn1(self.conv1(x))))
        x = self.pool(F.leaky_relu(self.bn2(self.conv2(x))))
        x = self.pool(F.leaky_relu(self.bn3(self.conv3(x))))
        x = self.pool(F.leaky_relu(self.bn4(self.conv4(x))))
        x = self.pool(F.leaky_relu(self.bn5(self.conv5(x))))
        x = self.avg(x)
        # collapse the 512x1x1 map into a 512-long feature vector per image
        x = x.view(-1, 512 * 1 * 1)
        x = self.fc(x)
        return x

In [None]:
# Instantiate the CNN and move it to the selected device.
# NOTE(review): this rebinds the global name `model`, which the PyCaret section of
# the notebook also assigns — keep the execution order in mind.
model = SimpleCNN().to(device)

In [None]:
# Loss and optimizer
# Cross-entropy over the model's raw outputs; Adamax updates every model parameter
# using learning_rate
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=learning_rate)


In [None]:
# Train the model
total_step = len(loader_train)
for epoch in range(num_epochs):
    # Enter training mode explicitly: the evaluation cell switches the model to
    # eval(), so re-running this cell afterwards would otherwise train with
    # BatchNorm still using its stored running statistics.
    model.train()
    for i, (images, labels) in enumerate(loader_train):
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Progress report every 100 batches
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_step}], Loss: {loss.item():.4f}')

In [None]:
# Test the model
model.eval()  # eval mode (batchnorm uses moving mean/variance instead of mini-batch mean/variance)
# 2x2 count matrix: rows are the true label, columns the predicted label.
# NOTE: this name shadows sklearn's confusion_matrix imported earlier in the
# notebook; it is kept because the cell that follows prints it.
confusion_matrix = torch.zeros(2, 2)
correct = 0
total = 0
with torch.no_grad():
    for images, labels in loader_valid:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        # Count the whole batch at once: encode each (true, predicted) pair as
        # true * 2 + predicted and histogram the four possible codes, instead of
        # updating the matrix one sample at a time.
        pair_codes = labels.view(-1).long() * 2 + predicted.view(-1).long()
        batch_counts = torch.bincount(pair_codes, minlength=4).view(2, 2)
        confusion_matrix += batch_counts.to(confusion_matrix)

print('Test Accuracy of the model on the test images: {} %'.format(100 * correct / total))

# Save the model checkpoint
torch.save(model.state_dict(), 'model.ckpt')

In [None]:
# Show the accumulated counts (rows: true label, columns: predicted label)
print(confusion_matrix)