# 1. ABOUT DATA AND PROJECT

In [1]:
**Goal:** Develop a model that helps identify metastatic cancer in small image patches. A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue.

**Size of the dataset:**

- Training set: 220025
- Test set: 57458

SyntaxError: invalid syntax (2283467727.py, line 1)

# 2. EDA <a class="anchor" id="prep"></a>



In [None]:
import numpy as np
import pandas as pd
import os
from os import listdir
import cv2

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set()
from PIL import Image
from glob import glob
from skimage.io import imread

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import torch 
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import TensorDataset, DataLoader, Dataset
import torch.optim as optim

import time
import copy
from tqdm import tqdm_notebook as tqdm

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)
warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)



In [None]:
# Model Parameters
num_epochs = 10
batch_size = 128
num_classes = 2
learning_rate = 0.002

# Reproducibility: seed the NumPy and PyTorch global random number generators
# so that random draws made later in the notebook repeat across
# "Restart Kernel -> Run All".
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Device configuration: first CUDA GPU when available, otherwise the CPU
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

### 2.1 Loading the data structure

In [None]:
# Root folder of the competition data; show which entries it contains.
base_dir = '../input/histopathologic-cancer-detection/'
dataset_entries = os.listdir(base_dir)
print(dataset_entries)

In [None]:
# Read the training label table and preview its first rows.
labels = pd.read_csv(f"{base_dir}train_labels.csv")
labels.head()

In [None]:
# (rows, columns) of the training label table.
labels.shape

In [None]:
# Column names, dtypes and non-null counts of the training label table.
labels.info()

This file contains the ids of images for training and their labels for cancer. 

In [None]:
# Image folders and the file names found in each. The trailing slash is kept
# because later cells build image paths by plain string concatenation.
train_path, test_path = (f"{base_dir}{split}/" for split in ("train", "test"))
train_files, test_files = os.listdir(train_path), os.listdir(test_path)

In [None]:
# First five file names listed in the training folder.
train_files[:5]

In [None]:
# First five file names listed in the test folder.
test_files[:5]

In [None]:
# Report how many image files each folder contains
for split_name, split_files in (("Train", train_files), ("Test", test_files)):
    print(split_name + " size: ", len(split_files))

In [None]:
# Share of all images (in percent) found in the train and test folders.
n_train, n_test = len(train_files), len(test_files)
n_total = n_train + n_test
print((n_train/n_total)*100, (n_test/n_total)*100)

The directories train and test contain the actual images with 79.3% and 20.7% of the total images respectively.

In [None]:
# Read the sample submission table and preview its first rows.
sub = pd.read_csv(f"{base_dir}sample_submission.csv")
sub.head()

In [None]:
# (rows, columns) of the sample submission table.
sub.shape

In [None]:
# Column names, dtypes and non-null counts of the sample submission table.
sub.info()

**2.2 ANALYSIS ON THE DATA SET**

Here we find that there are more samples of class 0, i.e. more images that do not show the presence of cancer, than of class 1. Thus we can see that there is a class imbalance. Let us plot the class distribution and have a look.

In [None]:
# Count the samples per class in a fixed order (0 = no cancer, 1 = cancer) so
# that every wedge is paired with the right name and colour. value_counts()
# on its own orders the classes by frequency, which would silently swap the
# wedge names if class 1 were ever the more frequent one.
class_counts = labels.label.value_counts().reindex([0, 1], fill_value=0)
plt.pie(class_counts, labels=['No Cancer', 'Cancer'], colors=['#90EE91', '#F47174'], autopct='%1.1f')
plt.show()

**2.3 Visualizing healthy and cancer patches**

In [None]:
# Draw 50 distinct image ids from each class for visual inspection.
sample_size = 50
positive_images = np.random.choice(labels.loc[labels.label == 1, "id"], size=sample_size, replace=False)
negative_images = np.random.choice(labels.loc[labels.label == 0, "id"], size=sample_size, replace=False)

**Cancer patches**

In [None]:
# 5 x 10 grid of the sampled cancer-positive patches, filled row by row.
fig, ax = plt.subplots(5, 10, figsize=(20,10))

for axis, img_id in zip(ax.flat, positive_images):
    axis.imshow(Image.open(train_path + img_id + ".tif"))
    axis.grid(False)
    # Hide tick labels: pixel coordinates carry no information here.
    axis.tick_params(labelbottom=False, labelleft=False)

**Healthy patches**

In [None]:
# 5 x 10 grid of the sampled cancer-negative patches, filled row by row.
fig, ax = plt.subplots(5, 10, figsize=(20,10))

for axis, img_id in zip(ax.flat, negative_images):
    axis.imshow(Image.open(train_path + img_id + ".tif"))
    axis.grid(False)
    # Hide tick labels: pixel coordinates carry no information here.
    axis.tick_params(labelbottom=False, labelleft=False)

**SOLUTION TO DATA IMBALANCE**

Thus, to balance the dataset, we are going to add more images to the under-represented class through data augmentation, which also increases the data size.

# 3. Data Preprocessing <a class="anchor" id="data"></a>



### 3.1 Splitting the data into train and validation sets

In [None]:
# Stratified 90/10 split of the labelled images, so both parts keep the class
# ratio of the full table. random_state pins the shuffle so the same rows land
# in train and validation on every run.
train, val = train_test_split(labels, stratify=labels.label, test_size=0.1, random_state=42)
print(len(train), len(val))

I have split the train data into train and validation sets in the ratio 9:1.

**Plotting the positive and negative ratio in train and val sets**

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(10,4))

# One panel per split. The column is passed by keyword (x=..., data=...) so the
# call stays valid on seaborn versions that no longer accept a positional
# data vector.
for axis, split, title, palette in ((ax[0], train, "Train dataset", "Blues"),
                                    (ax[1], val, "Validation dataset", "Greens")):
    sns.countplot(x="label", data=split, palette=palette, ax=axis)
    axis.set_title(title)
    # sort_index() lists the counts in class order, matching the bar order;
    # value_counts() alone orders by frequency instead.
    for i, rows in enumerate(split['label'].value_counts().sort_index().values):
        axis.annotate(int(rows), xy=(i, rows), ha='center')

### 3.2 Custom Dataset

I have created a dataset that loads an image patch, converts it to RGB, performs the augmentation if it's desired, and returns the image and its label.

In [None]:
class CancerDataset(Dataset):
    """Dataset of image patches stored as ``<image id>.tif`` files.

    Args:
        df_data: Table whose rows are ``(image id, label)`` pairs, in that
            column order. Only its ``.values`` array is kept.
        data_dir: Folder that contains the ``.tif`` files.
        transform: Optional callable applied to the loaded image array.
    """

    def __init__(self, df_data, data_dir = './', transform=None):
        super().__init__()
        self.df = df_data.values
        self.data_dir = data_dir
        self.transform = transform

    def __len__(self):
        """Return the number of ``(image id, label)`` rows."""
        return len(self.df)

    def __getitem__(self, index):
        """Load the patch at ``index`` and return ``(image, label)``.

        The image is read from disk, converted to RGB channel order and, when
        a transform was given, passed through it.

        Raises:
            FileNotFoundError: If the image file is missing or cannot be decoded.
        """
        img_name, label = self.df[index]
        img_path = os.path.join(self.data_dir, img_name + '.tif')
        image = cv2.imread(img_path)
        # cv2.imread reports failure by returning None instead of raising;
        # fail here with the offending path rather than deep inside a transform.
        if image is None:
            raise FileNotFoundError("Could not read image: " + img_path)
        # OpenCV decodes to BGR channel order; reorder to RGB as documented
        # for this dataset.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
        if self.transform is not None:
            image = self.transform(image)
        return image, label

### 3.3 Data Augmentation

To increase the variety of the training data, I apply random transformations (horizontal and vertical flips, and rotations of up to 20 degrees) to the training images, and then convert all datasets into normalized tensors.

In [None]:
def _patch_transform(*augmentations):
    """Build a patch pipeline: array -> PIL image -> augmentations -> normalised tensor.

    ``augmentations`` are inserted, in order, between the PIL conversion and
    the tensor conversion; with none given the pipeline only converts and
    normalises.
    """
    return transforms.Compose([transforms.ToPILImage(),
                               *augmentations,
                               transforms.ToTensor(),
                               transforms.Normalize(mean=[0.5, 0.5, 0.5],std=[0.5, 0.5, 0.5])])


# Training patches get random flips and a random rotation of up to 20 degrees.
transform_train = _patch_transform(transforms.RandomHorizontalFlip(),
                                   transforms.RandomVerticalFlip(),
                                   transforms.RandomRotation(20))

# Validation and test patches are only converted and normalised.
transform_val = _patch_transform()

transform_test = _patch_transform()

In [None]:
# Wrap each split in a CancerDataset. Train and validation rows both read from
# the training image folder; the submission table supplies the test image ids.
train_dataset, val_dataset, test_dataset = (
    CancerDataset(df_data=frame, data_dir=folder, transform=pipeline)
    for frame, folder, pipeline in (
        (train, train_path, transform_train),
        (val, train_path, transform_val),
        (sub, test_path, transform_test),
    )
)

In [None]:
# Training batches are reshuffled every epoch; drop_last keeps every training
# batch at the full batch size.
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)
# Evaluation loaders keep a fixed order and every sample. Dropping the last
# partial batch here would leave part of the validation set unscored.
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, drop_last=False)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [None]:
# Number of batches per pass for the train, validation and test loaders.
print(len(train_dataloader), len(val_dataloader), len(test_dataloader))

# 4. Defining the Model <a class="anchor" id="model"></a>

I am using a CNN with 5 convolutional blocks followed by a fully connected classifier as the model.

In [None]:
class CNN(nn.Module):
    """Five convolutional blocks followed by a fully connected classifier.

    Every block is Conv(3x3, padding 1) -> BatchNorm -> ReLU -> MaxPool(2x2),
    so each block halves the spatial size while the channel count goes
    3 -> 32 -> 64 -> 128 -> 256 -> 512. The classifier expects a 512 x 3 x 3
    feature map (what five halvings leave of a 96 x 96 input) and outputs
    ``num_classes`` raw scores per sample.
    """

    @staticmethod
    def _conv_block(in_channels, out_channels):
        """Return one Conv -> BatchNorm -> ReLU -> MaxPool block."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, 3, stride=1, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(2,2))

    def __init__(self):
        super(CNN,self).__init__()

        # Feature extractor; attribute names are kept so saved state dicts
        # keep the same keys.
        self.conv1 = self._conv_block(3, 32)
        self.conv2 = self._conv_block(32, 64)
        self.conv3 = self._conv_block(64, 128)
        self.conv4 = self._conv_block(128, 256)
        self.conv5 = self._conv_block(256, 512)

        # Classifier head on the flattened feature map.
        self.fc = nn.Sequential(
            nn.Linear(512*3*3, 256),
            nn.ReLU(inplace=True),
            nn.BatchNorm1d(256),
            nn.Dropout(0.4),
            nn.Linear(256, num_classes))

    def forward(self,x):
        """Map a batch of images to a ``(batch, num_classes)`` tensor of raw scores."""
        for block in (self.conv1, self.conv2, self.conv3, self.conv4, self.conv5):
            x = block(x)
        # Flatten everything except the batch dimension for the linear layers.
        flat = x.view(x.shape[0], -1)
        return self.fc(flat)

Printing the training model.

In [None]:
# Instantiate the network, move it to the selected device and print its layers.
model = CNN().to(device)
print(model)

### 4.1  Loss and Optimizer

This task is a binary classification problem with two classes: 1 for cancer-positive images and 0 for cancer-negative images. For the loss function, I have used cross-entropy loss.
For the optimizer, I have used Adamax (a variant of Adam).

In [None]:
# Cross-entropy over the two class scores, optimised with Adamax at the
# configured learning rate.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adamax(params=model.parameters(), lr=learning_rate)

# 5. Training the Model <a class="anchor" id="train"></a>

This section builds the training loop for the model. It prints the loss and the AUC for training and validation after each epoch.
As the evaluation metric, I compute the area under the ROC curve (AUC) between the model's score for the positive class and the observed target.
The losses and AUC values are also saved in lists for further evaluation of the model.

In [None]:
def _run_epoch(data_loader, training):
    """Run one pass over ``data_loader`` and return ``(mean batch loss, AUC)``.

    When ``training`` is true the model is put in train mode and updated after
    every batch; otherwise it is put in eval mode and gradient tracking is
    switched off. The AUC is computed once over all samples of the pass, using
    the softmax probability of class 1 as the score.
    """
    model.train(training)
    running_loss = 0.0
    epoch_targets, epoch_scores = [], []

    with torch.set_grad_enabled(training):
        # The batch labels are named `targets` so they do not overwrite the
        # `labels` DataFrame used by earlier cells.
        for images, targets in tqdm(data_loader, total=len(data_loader)):
            images = images.to(device)
            targets = targets.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, targets)

            # Backward and optimize
            if training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

            running_loss += loss.item()
            epoch_targets.append(targets.detach().cpu().numpy())
            # A single raw logit is not a valid ranking score for two-class
            # output; the softmax probability of class 1 is.
            epoch_scores.append(torch.softmax(outputs.detach(), dim=1)[:, 1].cpu().numpy())

    mean_loss = running_loss / len(data_loader)
    # One AUC over the whole pass: averaging per-batch AUCs is biased and
    # fails outright on a batch that contains a single class.
    auc = roc_auc_score(np.concatenate(epoch_targets), np.concatenate(epoch_scores))
    return mean_loss, auc


train_losses = []
val_losses = []
train_auc_epoch = []
val_auc_epoch = []
best_acc = 0.0
min_loss = np.inf  # np.Inf was removed in NumPy 2.0

since = time.time()

for e in range(num_epochs):

    # Metrics are computed from scratch each epoch, so one epoch's value is
    # not diluted by earlier epochs.
    train_loss, training_auc = _run_epoch(train_dataloader, training=True)
    val_loss, validation_auc = _run_epoch(val_dataloader, training=False)

    train_losses.append(train_loss)
    val_losses.append(val_loss)
    train_auc_epoch.append(training_auc)
    val_auc_epoch.append(validation_auc)

    # Updating best validation AUC
    if best_acc < validation_auc:
        best_acc = validation_auc

    # Saving best model (lowest validation loss so far)
    if min_loss >= val_loss:
        torch.save(model.state_dict(), 'best_model.pt')
        min_loss = val_loss

    print('EPOCH {}/{}'.format(e+1, num_epochs))
    print('-' * 10)
    print("Train loss: {:.6f}, Train AUC: {:.4f}".format(train_loss, training_auc))
    print("Validation loss: {:.6f}, Validation AUC: {:.4f}\n".format(val_loss, validation_auc))

time_elapsed = time.time() - since
print('Training completed in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
print('Best validation AUC: {:.4f}'.format(best_acc))

# 5.1 Model with different loss and regularization

Let us try another model with a different loss, namely sparse categorical cross-entropy, and a different regularization method: in addition to max pooling, let us try adding a dropout layer after each block.

In [None]:
# Second model: a stack of five conv blocks (Conv2D -> ReLU -> MaxPool2D ->
# Dropout) followed by a dense head, built layer by layer with model1.add().
# NOTE(review): Sequential, Conv2D, Activation, MaxPool2D, Dropout, Flatten and
# Dense are not imported by the import cell at the top of this notebook, so as
# far as the visible cells show this cell fails with NameError on a fresh
# kernel. They look like Keras layer names (presumably tensorflow.keras) --
# confirm and add the import, or port this model to PyTorch like the first one.
model1 = Sequential()



# Conv1: 32 filters of 3x3 on a 96x96x3 input, 'same' padding

model1.add(Conv2D(32, (3,3), input_shape=(96,96,3),padding='same'))

model1.add(Activation('relu'))

model1.add(MaxPool2D((2,2)))
model1.add(Dropout(0.3))


# Conv2: 64 filters of 3x3, 'valid' padding

model1.add(Conv2D(64, (3,3),padding='valid'))

model1.add(Activation('relu'))

model1.add(MaxPool2D((2,2)))
model1.add(Dropout(0.3))

# Conv3: 128 filters of 3x3, 'valid' padding

model1.add(Conv2D(128, (3,3),padding='valid'))

model1.add(Activation('relu'))

model1.add(MaxPool2D((2,2)))
model1.add(Dropout(0.3))

# Conv4: 256 filters of 3x3, 'valid' padding

model1.add(Conv2D(256, (3,3),padding='valid'))

model1.add(Activation('relu'))

model1.add(MaxPool2D((2,2)))
model1.add(Dropout(0.3))

# Conv5: 512 filters of 3x3, 'valid' padding
model1.add(Conv2D(512, (3,3),padding='valid'))

model1.add(Activation('relu'))

model1.add(MaxPool2D((2,2)))
model1.add(Dropout(0.3))

# Fully connected head: flatten, then 512 units with sigmoid activation

model1.add(Flatten())
model1.add(Dense(512, activation='sigmoid'))
model1.add(Dropout(0.3))

# Output: 2 units followed by a sigmoid activation.
# NOTE(review): this Dropout directly follows the Dropout above, so dropout is
# applied twice in a row before the output layer -- confirm this is intended.
model1.add(Dropout(0.3))
model1.add(Dense(2))
model1.add(Activation('sigmoid'))

model1.summary()

In [None]:
# Configure training for model1: sparse categorical cross-entropy loss, the
# 'adam' optimizer selected by name, and accuracy as the reported metric.
model1.compile(loss='sparse_categorical_crossentropy',           
              optimizer='adam', metrics=['accuracy'])

In [None]:
# Number of full batches per pass over each split. A PyTorch DataLoader has no
# `n` attribute; the sample count lives on its dataset. The validation loader
# created earlier in this notebook is named `val_dataloader`.
STEP_SIZE_TRAIN=len(train_dataloader.dataset)//train_dataloader.batch_size
STEP_SIZE_VALID=len(val_dataloader.dataset)//val_dataloader.batch_size

In [None]:
# Train model1 for 30 epochs, validating after every epoch; the returned
# object is kept in history1.
# NOTE(review): train_generator and valid_generator are not defined in any
# visible cell of this notebook -- confirm where they are supposed to come from.
history1 = model1.fit(train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=30, verbose=1
)

### 5.3 COMPARISON OF THE TWO MODELS

Our first model, trained for 10 epochs, seems to perform better than the second one, which uses the sparse categorical cross-entropy loss, the optimizer's default learning rate, and 30 epochs. This may be due to the loss function and the effect of the learning rate.

### 5.4 Plotting training history

**Loss Convergence**

In [None]:
# Training and validation loss per epoch on a shared axis.
fig, loss_ax = plt.subplots(figsize=(20,5))
for history, name in ((train_losses, "train"), (val_losses, "val")):
    loss_ax.plot(history, '-o', label=name)
loss_ax.set_xlabel("Epoch")
loss_ax.set_ylabel("Loss")
loss_ax.set_title("Loss change over epoch")
loss_ax.legend()

## 5.5 AUC trend

In [None]:
# Training and validation AUC per epoch. The plotted lists hold ROC AUC
# values, so the axis and title name that metric rather than accuracy.
plt.figure(figsize=(20,5))
plt.plot(train_auc_epoch, '-o', label="train")
plt.plot(val_auc_epoch, '-o', label="val")
plt.xlabel("Epoch")
plt.ylabel("AUC")
plt.title("AUC over epoch")
plt.legend()

## 5.6 Loading the best model

In [None]:
# Restore the weights saved at the lowest validation loss. map_location lets a
# checkpoint written on a GPU load on whichever device is in use now.
model.load_state_dict(torch.load('best_model.pt', map_location=device))

# 6. Model predictions <a class="anchor" id="pred"></a>

### 6.1 Predictions on test dataset

I have used my best model to make predictions on the test dataset.

In [None]:
model.eval()

predictions = []

# Inference only: no_grad skips gradient tracking for the forward passes.
with torch.no_grad():
    # The second item of every batch is the placeholder label column of the
    # submission table; it plays no part in prediction. It is given its own
    # name so it does not overwrite the `labels` DataFrame.
    for images, _placeholder_labels in tqdm(test_dataloader, total=len(test_dataloader)):
        images = images.to(device)

        outputs = model(images)
        # Turn the two raw class scores into the probability of class 1, so
        # the stored value lies in [0, 1] and a 0.5 threshold is meaningful.
        pred = torch.softmax(outputs, dim=1)[:, 1].cpu().numpy()

        predictions.extend(pred)

### 6.2  Modifying the submission file

Now I am using the predictions made by the model to create a submission file.

In [None]:
# Replace the placeholder labels with the model's predictions (one per row, in
# the unshuffled order the test loader produced them) and write the file.
sub['label'] = predictions
sub.to_csv('submission.csv', index=False)
sub.info()

# 7. Visualising predictions



In [None]:
# Pick 50 distinct test image ids and show each patch with its predicted class.
test_images = np.random.choice(sub.id, size=50, replace=False)

fig, ax = plt.subplots(5, 10, figsize=(20,10))

for axis, img_id in zip(ax.flat, test_images):
    # Look up the stored prediction for this image and threshold it at 0.5.
    pred = sub.loc[sub['id'] == img_id, 'label'].values[0]
    if pred >= 0.5:
        label = "Cancer"
    else:
        label = "Healthy"
    axis.imshow(Image.open(test_path + img_id + ".tif"))
    axis.grid(False)
    axis.tick_params(labelbottom=False, labelleft=False)
    axis.set_title("Label: " + label)

# 8. FUTURE ENHANCEMENTS

The model can be further improved by increasing the number of epochs or by tuning hyperparameters such as the learning rate and the dropout rate.