In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

#import numpy as np # linear algebra
#import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

#import os
#for dirname, _, filenames in os.walk('/kaggle/input'):
   # for filename in filenames:
        #print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Goal:** Develop a model that helps in identifying metastatic cancer from small image patches. A positive label indicates that the center 32x32px region of a patch contains at least one pixel of tumor tissue. 

**Size of the dataset:**

Training set - 220025
Test set - 57458

**EXPLORATORY DATA ANALYSIS**

**1. Determine if there are imbalance classes in the training set so that we can make them balanced.**

In [2]:
##### Load the training set ##############
import numpy as np
import pandas as pd

# Every column is read as a string; the cell then reports (rows, columns).
train_labels = pd.read_csv(
    '../input/histopathologic-cancer-detection/train_labels.csv',
    dtype=str,
)
print(train_labels.shape)

In [4]:
# Number of samples per class label.
train_labels.label.value_counts()

Here we find that class 0 has more samples, that is, there are more images without tumor tissue (negative class) than images showing the presence of cancer (positive class, label 1). Thus we can see that there is class imbalance. Let us plot and have a look at the same.

In [5]:
# Bar chart of the per-class sample counts.
train_labels['label'].value_counts().plot.bar()

To balance the classes, let us remove some data from the negative class ("0" class), which is the majority class.

In [7]:
######### Convert the "label" from object to int data type #######
# Only the 'label' column is cast; 'id' stays a string.
train_labels = train_labels.astype({'label': int})

In [None]:
#tr=train_labels[(train_labels['label'] ==1 ) ]
#print(">>",tr)

In [8]:
# Split the labels by class: 1 = positive patches, 0 = negative patches.
train_labels_pos = train_labels[train_labels['label'] == 1]
print(train_labels_pos.shape[0])

# Undersample the negative class down to the size of the positive class.
# A fixed random_state makes the balanced subset identical on every
# Restart & Run All, so downstream splits and metrics are reproducible.
train_labels_neg = train_labels[train_labels['label'] == 0]
train_labels_neg = train_labels_neg.sample(
    n=train_labels_pos.shape[0],
    random_state=12345,
)

In [9]:
# Row counts of the negative and positive subsets, one per line.
print(len(train_labels_neg), len(train_labels_pos), sep='\n')

In [10]:
# Stack both subsets, shuffle all rows with a fixed seed, and renumber the index.
train_labels_full = (
    pd.concat([train_labels_neg, train_labels_pos])
    .sample(frac=1, random_state=12345)
    .reset_index(drop=True)
)
train_labels_full.head()

**2. Plot the cancer and non-cancer images**

In [11]:
# Draw 50 distinct ids of positive (label == 1) patches to display.
# A locally seeded generator keeps the selection identical across kernel
# restarts instead of depending on NumPy's global random state.
pos_ids = train_labels_full.loc[train_labels_full.label == 1, 'id'].to_numpy()
pos_images = np.random.default_rng(12345).choice(pos_ids, size=50, replace=False)
print(type(pos_images))

In [12]:
import matplotlib.pyplot as plt
from PIL import Image

# 5 x 10 grid of the sampled positive patches; ax.flat walks the grid
# row by row, matching the order of pos_images.
fig, ax = plt.subplots(5, 10, figsize=(20,10))

for cell, img_id in zip(ax.flat, pos_images):
    image = Image.open('../input/histopathologic-cancer-detection/train/' + img_id + ".tif")
    cell.imshow(image)
    cell.grid(False)
    cell.tick_params(labelbottom=False, labelleft=False)

In [13]:
# Draw 50 distinct ids of negative (label == 0) patches to display.
# A locally seeded generator keeps the selection identical across kernel
# restarts instead of depending on NumPy's global random state.
neg_ids = train_labels_full.loc[train_labels_full.label == 0, 'id'].to_numpy()
neg_images = np.random.default_rng(12345).choice(neg_ids, size=50, replace=False)
# Report the type of the array created in this cell (previously printed pos_images).
print(type(neg_images))

In [14]:
# 5 x 10 grid of the sampled negative patches.
fig, ax = plt.subplots(5, 10, figsize=(20,10))

for position, img_id in enumerate(neg_images[:50]):
    row, col = divmod(position, 10)
    panel = ax[row, col]
    image = Image.open('../input/histopathologic-cancer-detection/train/' + img_id + ".tif")
    panel.imshow(image)
    panel.grid(False)
    panel.tick_params(labelbottom=False, labelleft=False)

**DATA PREPROCESSING**

Let us split the data into training and validation sets

In [15]:
from sklearn.model_selection import train_test_split

In [16]:
# 75 % / 25 % split, stratified on the label so both parts keep the same
# class proportions; the seed fixes which rows land in which part.
train_df, valid_df = train_test_split(
    train_labels_full,
    test_size=0.25,
    random_state=1234,
    stratify=train_labels_full.label,
)

In [17]:
# The image files on disk are named "<id>.tif", so append the extension to
# each id. Two safeguards compared with a plain column assignment:
#   * .assign() returns a new frame, avoiding SettingWithCopyWarning on the
#     frames produced by train_test_split;
#   * ids that already end in ".tif" are left alone, so re-running this cell
#     does not produce "<id>.tif.tif".
train_df = train_df.assign(
    id=train_df['id'].where(train_df['id'].str.endswith('.tif'), train_df['id'] + '.tif')
)
valid_df = valid_df.assign(
    id=valid_df['id'].where(valid_df['id'].str.endswith('.tif'), valid_df['id'] + '.tif')
)

In [18]:
# Labels are cast to strings before being handed to flow_from_dataframe
# with class_mode="binary".
train_df = train_df.astype({'label': str})
valid_df = valid_df.astype({'label': str})

In [19]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow import keras
from keras.layers import Dense, Activation, Flatten, Dropout, BatchNormalization
from keras.layers import Conv2D, MaxPooling2D,MaxPool2D
from keras import regularizers, optimizers
from keras.layers import PReLU
from keras.initializers import Constant
from tensorflow.keras.preprocessing.image import ImageDataGenerator

In [20]:
#create the training and validation subsets
train_datagen = ImageDataGenerator(rescale=1/255)

# Both generators read from the same image directory with identical settings;
# only the source dataframe differs.
_flow_options = dict(
    directory="../input/histopathologic-cancer-detection/train/",
    x_col="id",
    y_col="label",
    batch_size=64,
    seed=1234,
    shuffle=True,
    class_mode="binary",
    target_size=(96,96),
)

train_generator = train_datagen.flow_from_dataframe(dataframe=train_df, **_flow_options)

valid_generator = train_datagen.flow_from_dataframe(dataframe=valid_df, **_flow_options)

In [None]:
# Baseline CNN: five Conv -> ReLU -> MaxPool stages followed by a dense head.
model = Sequential()

# Conv1: 'same' padding on the 96x96x3 input.
model.add(Conv2D(32, (3,3), input_shape=(96,96,3), padding='same'))
model.add(Activation('relu'))
model.add(MaxPool2D((2,2)))

# Conv2-Conv5: identical stages with 'valid' padding and doubling filter counts.
for n_filters in (64, 128, 256, 512):
    model.add(Conv2D(n_filters, (3,3), padding='valid'))
    model.add(Activation('relu'))
    model.add(MaxPool2D((2,2)))

# FC
model.add(Flatten())
# NOTE(review): a sigmoid hidden activation is unusual next to the ReLU conv
# stages; kept as in the original design.
model.add(Dense(512, activation='sigmoid'))

#op
model.add(Dropout(0.3))
model.add(Dense(2))
# Two mutually exclusive classes trained with sparse_categorical_crossentropy:
# softmax yields a proper probability distribution over the two units
# (the previous per-unit sigmoid did not sum to 1 across classes).
model.add(Activation('softmax'))

model.summary()







In [None]:
# Integer class labels + two output units -> sparse categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [21]:
# Number of whole batches per epoch for each generator (any final partial
# batch is not counted because of the floor division).
STEP_SIZE_TRAIN, STEP_SIZE_VALID = (
    gen.n // gen.batch_size for gen in (train_generator, valid_generator)
)

In [None]:


# Train the baseline model for 30 epochs, validating after every epoch.
history = model.fit(
    train_generator,
    epochs=30,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    verbose=1,
)

In [None]:
import gc
gc.collect()

# Training vs. validation accuracy per epoch for the baseline model.
plt.figure(figsize=(20,5))
for metric_key, curve_label in (('accuracy', "train"), ('val_accuracy', "Validation")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy change over epoch")
plt.legend()

In [None]:
import gc
gc.collect()

# Training vs. validation loss per epoch for the baseline model.
plt.figure(figsize=(20,5))
for metric_key, curve_label in (('loss', "train"), ('val_loss', "Validation")):
    plt.plot(history.history[metric_key], label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss change over epoch")
plt.legend()

From the above curve, we can see that our initial model is overfitting. Let us now create another model by adding dropout at each layer.

In [22]:
# Model1: same architecture as the baseline, with Dropout(0.3) after every stage.
model1 = Sequential()

# Conv1: 'same' padding on the 96x96x3 input.
model1.add(Conv2D(32, (3,3), input_shape=(96,96,3), padding='same'))
model1.add(Activation('relu'))
model1.add(MaxPool2D((2,2)))
model1.add(Dropout(0.3))

# Conv2-Conv5: identical stages with 'valid' padding and doubling filter counts.
for n_filters in (64, 128, 256, 512):
    model1.add(Conv2D(n_filters, (3,3), padding='valid'))
    model1.add(Activation('relu'))
    model1.add(MaxPool2D((2,2)))
    model1.add(Dropout(0.3))

# FC
model1.add(Flatten())
model1.add(Dense(512, activation='sigmoid'))
model1.add(Dropout(0.3))

#op
# NOTE(review): this is a second Dropout directly after the one above, so the
# two stack; kept as in the original, confirm whether that is intended.
model1.add(Dropout(0.3))
model1.add(Dense(2))
# Two mutually exclusive classes trained with sparse_categorical_crossentropy:
# softmax yields a proper probability distribution over the two units
# (the previous per-unit sigmoid did not sum to 1 across classes).
model1.add(Activation('softmax'))

model1.summary()







In [23]:
# Same loss / optimizer / metric configuration as the baseline model.
model1.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train the dropout model for 30 epochs, validating after every epoch.
history1 = model1.fit(
    train_generator,
    epochs=30,
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    verbose=1,
)

In [None]:
import gc
gc.collect()

# Training vs. validation loss per epoch for Model1 (30-epoch run).
plt.figure(figsize=(20,5))
loss_curves = {"train": history1.history['loss'], "Validation": history1.history['val_loss']}
for curve_label, values in loss_curves.items():
    plt.plot(values, '-o', label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss change over epoch for Model1[Dropout at each layer]")
plt.legend()
plt.grid()

In [None]:
import gc
gc.collect()

# Training vs. validation accuracy per epoch for Model1 (30-epoch run).
plt.figure(figsize=(20,5))
accuracy_curves = {"train": history1.history['accuracy'], "Validation": history1.history['val_accuracy']}
for curve_label, values in accuracy_curves.items():
    plt.plot(values, label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy change over epoch Model1[Dropout at each layer]")
plt.legend()

Though this model seems good, we can see that it has converged by epoch 25; therefore, let us try running Model1 (dropout at each layer) with the number of epochs reduced to 25 and see how the results look.

In [None]:
# Run the 25-epoch experiment on a fresh copy of Model1.
# Calling model1.fit() again would NOT restart training: it would continue from
# the weights already trained for 30 epochs (55 epochs in total) and would also
# change the model1 that is used later for test-set prediction.
# clone_model() rebuilds the same architecture with newly initialised weights.
model1_epochs25 = tf.keras.models.clone_model(model1)
model1_epochs25.compile(loss='sparse_categorical_crossentropy',
                        optimizer='adam', metrics=['accuracy'])

history2 = model1_epochs25.fit(train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=25, verbose=1
)

In [None]:
import gc
gc.collect()

# Training vs. validation loss per epoch for the 25-epoch run.
plt.figure(figsize=(20,5))
for metric_key, curve_label in zip(('loss', 'val_loss'), ("train", "Validation")):
    plt.plot(history2.history[metric_key], '-o', label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Loss change over epoch for Model1[Dropout at each layer] epochs to 25")
plt.legend()
plt.grid()

In [None]:
import gc
gc.collect()

# Training vs. validation accuracy per epoch for the 25-epoch run.
plt.figure(figsize=(20,5))
for metric_key, curve_label in zip(('accuracy', 'val_accuracy'), ("train", "Validation")):
    plt.plot(history2.history[metric_key], label=curve_label)
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.title("Accuracy change over epoch Model1[Dropout at each layer] epochs to 25")
plt.legend()

From the above graphs we see that the model is overfitting; therefore, we will use the model trained for 30 epochs.

In [None]:
import os

# File names (not full paths) of everything in the test image directory,
# in the order the OS returns them.
test_set = [entry for entry in os.listdir('../input/histopathologic-cancer-detection/test/')]

In [None]:
# One row per test image file. The column must be named 'id' because the
# test generator (x_col="id") and the submission cell (submission['id'])
# both look it up by that name; an unnamed column would be labelled 0 and
# raise a KeyError there.
test_df = pd.DataFrame({'id': test_set})
test_df.head()


In [None]:
# Same 1/255 rescaling as used for training. No labels are yielded
# (class_mode=None) and shuffling is off so predictions stay aligned with
# the row order of test_df.
test_datagen = ImageDataGenerator(rescale=1/255)

test_generator = test_datagen.flow_from_dataframe(
    dataframe=test_df,
    directory="../input/histopathologic-cancer-detection/test/",
    x_col="id",
    batch_size=64,
    seed=1234,
    shuffle=False,
    class_mode=None,
    target_size=(96,96),
)

In [None]:
# Number of batches needed to see every test image exactly once, including the
# final partial batch. The previous value (n / 2) was a float unrelated to the
# batch size, which made the generator wrap around and yield far more
# predictions than there are test images.
STEP_SIZE_TEST = int(np.ceil(test_generator.n / test_generator.batch_size))

# Start from the first batch so predictions line up with test_df row order.
test_generator.reset()

# Model.predict accepts generators directly; predict_generator is deprecated.
preds = model1.predict(test_generator, steps=STEP_SIZE_TEST, verbose=1)

In [None]:
# Inspect the raw model outputs. The prediction cell stores them in `preds`;
# `pred_test` was never defined and raised a NameError.
print(type(preds))
print(preds[:50])

In [None]:
# Convert the per-class scores into hard 0/1 labels.
# Each row of `preds` holds one score per class (the network ends in Dense(2)),
# so the predicted label is the index of the larger score.
# The previous test `item.all() >= 0.5` compared a boolean (True whenever both
# scores are non-zero) against 0.5 and therefore labelled virtually every
# image as 1.
predictions = np.argmax(np.asarray(preds), axis=1).tolist()

predictions[:10]

In [None]:
# Build the submission table on a new frame (test_df is left untouched):
# drop the last four characters of each file name and attach the labels.
submission = test_df.assign(
    id=test_df['id'].str.slice(stop=-4),
    label=predictions,
)
submission.head()

In [None]:
# Write the submission file to the working directory without the index column.
submission.to_csv(path_or_buf='submission.csv', index=False)

In [None]:
# Model2: same architecture as Model1 with a stronger dropout rate of 0.5.
model2 = Sequential()

# Conv1: 'same' padding on the 96x96x3 input.
model2.add(Conv2D(32, (3,3), input_shape=(96,96,3), padding='same'))
model2.add(Activation('relu'))
model2.add(MaxPool2D((2,2)))
model2.add(Dropout(0.5))

# Conv2-Conv5: identical stages with 'valid' padding and doubling filter counts.
for n_filters in (64, 128, 256, 512):
    model2.add(Conv2D(n_filters, (3,3), padding='valid'))
    model2.add(Activation('relu'))
    model2.add(MaxPool2D((2,2)))
    model2.add(Dropout(0.5))

# FC
model2.add(Flatten())
model2.add(Dense(512, activation='sigmoid'))
model2.add(Dropout(0.5))

#op
# NOTE(review): this is a second Dropout directly after the one above, so the
# two stack; kept as in the original, confirm whether that is intended.
model2.add(Dropout(0.5))
model2.add(Dense(2))
# Two mutually exclusive classes trained with sparse_categorical_crossentropy:
# softmax yields a proper probability distribution over the two units
# (the previous per-unit sigmoid did not sum to 1 across classes).
model2.add(Activation('softmax'))

model2.summary()







In [None]:
# Same loss / optimizer / metric configuration as the earlier models.
model2.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train model2, the network built and compiled in the cells above.
# The call previously targeted model1, so model2 was never trained and model1
# received 25 additional epochs instead.
# Note: this rebinds history2, replacing the history of the earlier 25-epoch run.
history2 = model2.fit(train_generator,
                    steps_per_epoch=STEP_SIZE_TRAIN,
                    validation_data=valid_generator,
                    validation_steps=STEP_SIZE_VALID,
                    epochs=25, verbose=1
)