<a href="https://colab.research.google.com/github/gowtham91m/Predicting-IDC-in-Breast-Cancer-Histology-Images/blob/master/IDC.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install Kaggle API for download competition data
!pip3 install -q kaggle
import os,shutil,fnmatch, random
from glob import glob
from time import time
import numpy as np
import cv2
import keras
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, model_from_json, Model
from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D
from sklearn.utils import class_weight

from google.colab import files

Using TensorFlow backend.


In [2]:
root_dir = '/content'
data_dir = os.path.join(root_dir,'IDC')
download_dir = os.path.join(data_dir,'breast-histopathology-images')
train_path = os.path.join(data_dir,'train')
val_path = os.path.join(data_dir,'val')
train_neg_path = os.path.join(train_path,'0')
train_pos_path = os.path.join(train_path,'1')
val_neg_path = os.path.join(val_path,'0')
val_pos_path = os.path.join(val_path,'1')

neg_class_images = os.path.join(download_dir, 'IDC_regular_ps50_idx5/**/0/*.png')
pos_class_images = os.path.join(download_dir, 'IDC_regular_ps50_idx5/**/1/*.png')

os.chdir(root_dir)
os.environ['KAGGLE_CONFIG_DIR'] = root_dir
os.chdir(root_dir)
if 'kaggle.json' not in os.listdir(root_dir):downloaded = files.upload()

if 'IDC' not in os.listdir(root_dir):
  os.mkdir(data_dir)
  os.chdir(data_dir)

  !kaggle datasets download -d paultimothymooney/breast-histopathology-images
  !unzip -q -o breast-histopathology-images.zip -d breast-histopathology-images
  os.chdir(download_dir)
  !unzip -q -o IDC_regular_ps50_idx5.zip -d IDC_regular_ps50_idx5

  if 'train' not in os.listdir(data_dir): os.mkdir(train_path)
  if '0' not in os.listdir(train_path): os.mkdir(train_neg_path)
  if '1' not in os.listdir(train_path): os.mkdir(train_pos_path)

  if 'val' not in os.listdir(data_dir): os.mkdir(val_path)
  if '0' not in os.listdir(val_path): os.mkdir(val_neg_path)
  if '1' not in os.listdir(val_path): os.mkdir(val_pos_path)
    
  negative_class_images = glob(neg_class_images, recursive=True)  
  positive_class_images = glob(pos_class_images, recursive=True)

  for file in negative_class_images: shutil.copy2(file,train_neg_path)
  for file in positive_class_images: shutil.copy2(file,train_pos_path)

  # split train date into train and validation
  train_neg_len = len(os.listdir(train_neg_path))
  val_neg_len = train_neg_len * 0.3

  train_pos_len = len(os.listdir(train_pos_path))
  val_pos_len = train_pos_len * 0.3

  val_pos = random.sample(os.listdir(train_pos_path),int(val_pos_len))
  val_neg = random.sample(os.listdir(train_neg_path),int(val_neg_len))


  for file in val_pos:
    try: shutil.move(os.path.join(train_pos_path,file), val_pos_path)
    except: pass
  for file in val_neg:
    try: shutil.move(os.path.join(train_neg_path,file), val_neg_path)
    except: pass


Saving kaggle.json to kaggle.json
Downloading breast-histopathology-images.zip to /content/IDC
100% 1.48G/1.49G [00:14<00:00, 106MB/s]
100% 1.49G/1.49G [00:14<00:00, 110MB/s]


In [3]:
# Number of source patches found for each class.
for label, images in (('length of negative images', negative_class_images),
                      ('length of positive images', positive_class_images)):
  print(label, len(images))

# Class weights are not computed here; the sklearn compute_class_weight call
# that used to sit in this cell is left disabled.

# Number of files currently in each train / validation class directory.
for label, folder in (('train negative images', train_neg_path),
                      ('train positive image', train_pos_path),
                      ('val negative images', val_neg_path),
                      ('val positive images', val_pos_path)):
  print(label, len(os.listdir(folder)))

length of negative images 198738
length of positive images 78786
train negative images 139117
train positive image 55151
val negative images 59621
val positive images 23635


# CNN

In [13]:
batch_size = 128

# Training batches are rescaled to [0, 1] and augmented (shear, zoom,
# horizontal flip); validation batches are only rescaled.
train_datagen = ImageDataGenerator(rescale=1./255, shear_range=0.2,
                                   zoom_range=0.2, horizontal_flip=True)
val_datagen = ImageDataGenerator(rescale=1./255)

# Both generators read 50x50 images with binary labels taken from the
# class sub-directory names.
flow_options = dict(target_size=(50, 50), batch_size=batch_size, class_mode='binary')
train_generator = train_datagen.flow_from_directory(train_path, **flow_options)
validation_generator = val_datagen.flow_from_directory(val_path, **flow_options)

# Four 3x3 convolution stages (pooling after the last two), then a small
# dense head ending in a single sigmoid unit.
model = Sequential([
    Conv2D(128, (3, 3), input_shape=(50, 50, 3)),
    Activation('relu'),

    Conv2D(64, (3, 3)),
    Activation('relu'),

    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Conv2D(64, (3, 3)),
    Activation('relu'),
    MaxPooling2D(pool_size=(2, 2)),

    Flatten(),  # 3D feature maps -> 1D feature vector
    Dense(32),
    Activation('relu'),
    Dropout(0.5),
    Dense(1),
    Activation('sigmoid'),
])

model.compile(optimizer=Adam(0.001),
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Keep the best full model by validation loss, and cut the learning rate
# tenfold after 3 epochs without improvement.
checkpoint = ModelCheckpoint('./base.model', monitor='val_loss', save_best_only=True,
                             save_weights_only=False, mode='min', period=1)
reduceLROnPlato = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3,
                                    verbose=1, mode='min')

start_time = time()
training_callbacks = [checkpoint, reduceLROnPlato, EarlyStopping(patience=8)]
model.fit_generator(train_generator,
                    epochs=50,
                    validation_data=validation_generator,
                    callbacks=training_callbacks)
# Weights as they stand after the last epoch that ran.
model.save_weights('cnn.h5')

print('time taken ',time()-start_time)

Found 194268 images belonging to 2 classes.
Found 83256 images belonging to 2 classes.
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50

Epoch 00020: ReduceLROnPlateau reducing learning rate to 0.00010000000474974513.
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50

# Transfer learning

In [0]:
#BASE_MODEL = 'VGG16'
from keras.preprocessing.image import ImageDataGenerator

def transfer_learning(BASE_MODEL,optimizer):
  """Train a small binary head on top of a frozen ImageNet backbone.

  Builds train/validation generators over ``train_path`` and ``val_path``
  (images resized to 250x250), loads the requested ``keras.applications``
  network without its top, freezes every backbone layer, appends
  Flatten -> Dense(64, relu) -> Dense(1, sigmoid) and trains for 8 epochs.
  The best model by ``val_acc`` is checkpointed to ``<BASE_MODEL>.model`` and
  the weights after the last epoch are written to ``<BASE_MODEL>.h5``.

  Args:
    BASE_MODEL: backbone name; one of 'VGG16', 'VGG19' (the legacy spelling
      'vgg19' is still accepted), 'ResNet50', 'InceptionV3', 'Xception',
      'DenseNet169', 'DenseNet121'.
    optimizer: Keras optimizer handed to ``model.compile``.

  Raises:
    ValueError: if ``BASE_MODEL`` is not one of the supported names.
  """
  # Backbones are imported lazily so only the requested one is loaded.
  if BASE_MODEL=='VGG16':
    from keras.applications.vgg16 import VGG16 as PTModel, preprocess_input
  elif BASE_MODEL in ('VGG19', 'vgg19'):
    from keras.applications.vgg19 import VGG19 as PTModel, preprocess_input
  elif BASE_MODEL=='ResNet50':
    from keras.applications.resnet50 import ResNet50 as PTModel, preprocess_input
  elif BASE_MODEL=='InceptionV3':
    from keras.applications.inception_v3 import InceptionV3 as PTModel, preprocess_input
  elif BASE_MODEL=='Xception':
    from keras.applications.xception import Xception as PTModel, preprocess_input
  elif BASE_MODEL=='DenseNet169':
    from keras.applications.densenet import DenseNet169 as PTModel, preprocess_input
  elif BASE_MODEL=='DenseNet121':
    from keras.applications.densenet import DenseNet121 as PTModel, preprocess_input
  else:
    raise ValueError('Unknown model: {}'.format(BASE_MODEL))

  import keras
  # NOTE(review): this fixes the backend learning-phase flag to 1 for everything
  # built afterwards; presumably that also applies to validation passes — confirm
  # this is intended.
  keras.backend.set_learning_phase(1)

  check_point_name = BASE_MODEL + '.model'
  model_weights = BASE_MODEL + '.h5'

  # Each backbone's preprocess_input performs its own scaling of the raw 0-255
  # pixels, so no additional `rescale` is applied: combining the two would
  # normalise the inputs twice and feed the pretrained weights data outside
  # the range they were trained on.
  train_datagen = ImageDataGenerator(
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True,
      preprocessing_function=preprocess_input)

  val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

  img_rows, img_cols, img_channel = 250, 250, 3
  batch_size=128
  train_generator = train_datagen.flow_from_directory(
      train_path,  # one sub-directory per class
      target_size=(img_rows, img_cols),  # images are resized to 250x250
      batch_size=batch_size,
      class_mode='binary')

  validation_generator = val_datagen.flow_from_directory(
      val_path,
      target_size=(img_rows, img_cols),
      batch_size=batch_size,
      class_mode='binary')

  # `classes` is not passed: it only affects the classification top, which is
  # excluded here.
  base_model = PTModel(weights='imagenet',
                       include_top=False,
                       input_shape=(img_rows, img_cols, img_channel))

  add_model = Sequential()
  add_model.add(Flatten(input_shape=base_model.output_shape[1:]))
  add_model.add(Dense(64, activation='relu'))
  add_model.add(Dense(1, activation='sigmoid'))

  model = Model(inputs=base_model.input, outputs=add_model(base_model.output))

  # Only the new head is trained; the backbone stays frozen.
  for layer in base_model.layers:
    layer.trainable = False

    # NOTE(review): the tensor returned by this call is discarded, so it looks
    # like it does not alter the already-built model — confirm before relying
    # on it to put batch-norm layers into inference mode.
    if layer.name.startswith('bn'):
      layer.call(layer.input, training=False)

  model.compile(loss='binary_crossentropy',
                optimizer=optimizer,
                metrics=['accuracy'])
  start_time = time()
  model.fit_generator(
      train_generator,
      epochs=8,
      validation_data=validation_generator,
      callbacks=[ModelCheckpoint(check_point_name, monitor='val_acc', save_best_only=True)])
  model.save_weights(model_weights)

  print('time taken ',time()-start_time)

In [0]:
# VGG16 backbone, Adam with a 1e-3 learning rate.
optimizer = Adam(lr=0.001)
transfer_learning(BASE_MODEL='VGG16', optimizer=optimizer)

Found 194268 images belonging to 2 classes.
Found 83256 images belonging to 2 classes.
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8

In [0]:
# DenseNet169 backbone, Adam with a 1e-3 learning rate.
optimizer = Adam(lr=0.001)
transfer_learning(BASE_MODEL='DenseNet169', optimizer=optimizer)

Found 194268 images belonging to 2 classes.
Found 83256 images belonging to 2 classes.
Downloading data from https://github.com/fchollet/deep-learning-models/releases/download/v0.8/densenet169_weights_tf_dim_ordering_tf_kernels_notop.h5
Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8

In [0]:
from keras.preprocessing.image import ImageDataGenerator
import tensorflow as tf   
import keras
# Fix the Keras backend learning-phase flag to 1 before any model is built.
# NOTE(review): transfer_learning below sets the same flag again; presumably 1
# selects training behaviour, which would then also apply during validation —
# confirm this is intended.
keras.backend.set_learning_phase(1)

def transfer_learning(BASE_MODEL,optimizer):
  """TPU variant: train a binary head on a frozen ImageNet backbone.

  Builds train/validation generators over ``train_path`` and ``val_path``
  (images resized to 250x250), loads the requested ``keras.applications``
  network without its top, freezes every backbone layer, appends
  Flatten -> Dense(64, relu) -> Dense(1, sigmoid), converts the model with
  ``tf.contrib.tpu.keras_to_tpu_model`` and trains it for 8 epochs on the
  Colab TPU named by the ``COLAB_TPU_ADDR`` environment variable. The best
  model by ``val_acc`` is checkpointed to ``<BASE_MODEL>.model`` and the
  weights after the last epoch are written to ``<BASE_MODEL>.h5``.

  This definition replaces any earlier ``transfer_learning`` in the notebook.

  Args:
    BASE_MODEL: backbone name; one of 'VGG16', 'VGG19' (the legacy spelling
      'vgg19' is still accepted), 'ResNet50', 'InceptionV3', 'Xception',
      'DenseNet169', 'DenseNet121'.
    optimizer: kept so existing calls keep working, but NOT used — the TPU
      model is always compiled with ``tf.train.AdamOptimizer(0.001)``.

  Raises:
    ValueError: if ``BASE_MODEL`` is not one of the supported names.
    KeyError: if ``COLAB_TPU_ADDR`` is not set in the environment.
  """
  # Backbones are imported lazily so only the requested one is loaded.
  if BASE_MODEL=='VGG16':
    from keras.applications.vgg16 import VGG16 as PTModel, preprocess_input
  elif BASE_MODEL in ('VGG19', 'vgg19'):
    from keras.applications.vgg19 import VGG19 as PTModel, preprocess_input
  elif BASE_MODEL=='ResNet50':
    from keras.applications.resnet50 import ResNet50 as PTModel, preprocess_input
  elif BASE_MODEL=='InceptionV3':
    from keras.applications.inception_v3 import InceptionV3 as PTModel, preprocess_input
  elif BASE_MODEL=='Xception':
    from keras.applications.xception import Xception as PTModel, preprocess_input
  elif BASE_MODEL=='DenseNet169':
    from keras.applications.densenet import DenseNet169 as PTModel, preprocess_input
  elif BASE_MODEL=='DenseNet121':
    from keras.applications.densenet import DenseNet121 as PTModel, preprocess_input
  else:
    raise ValueError('Unknown model: {}'.format(BASE_MODEL))

  import keras
  # NOTE(review): this fixes the backend learning-phase flag to 1 for everything
  # built afterwards; presumably that also applies to validation passes — confirm
  # this is intended.
  keras.backend.set_learning_phase(1)

  check_point_name = BASE_MODEL + '.model'
  model_weights = BASE_MODEL + '.h5'

  # Each backbone's preprocess_input performs its own scaling of the raw 0-255
  # pixels, so no additional `rescale` is applied: combining the two would
  # normalise the inputs twice and feed the pretrained weights data outside
  # the range they were trained on.
  train_datagen = ImageDataGenerator(
      shear_range=0.2,
      zoom_range=0.2,
      horizontal_flip=True,
      preprocessing_function=preprocess_input)

  val_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

  img_rows, img_cols, img_channel = 250, 250, 3
  batch_size=128
  train_generator = train_datagen.flow_from_directory(
      train_path,  # one sub-directory per class
      target_size=(img_rows, img_cols),  # images are resized to 250x250
      batch_size=batch_size,
      class_mode='binary')

  validation_generator = val_datagen.flow_from_directory(
      val_path,
      target_size=(img_rows, img_cols),
      batch_size=batch_size,
      class_mode='binary')

  # `classes` is not passed: it only affects the classification top, which is
  # excluded here.
  base_model = PTModel(weights='imagenet',
                       include_top=False,
                       input_shape=(img_rows, img_cols, img_channel))

  add_model = Sequential()
  add_model.add(Flatten(input_shape=base_model.output_shape[1:]))
  add_model.add(Dense(64, activation='relu'))
  add_model.add(Dense(1, activation='sigmoid'))

  model = Model(inputs=base_model.input, outputs=add_model(base_model.output))

  # Only the new head is trained; the backbone stays frozen.
  for layer in base_model.layers:
    layer.trainable = False

    # NOTE(review): the tensor returned by this call is discarded, so it looks
    # like it does not alter the already-built model — confirm before relying
    # on it to put batch-norm layers into inference mode.
    if layer.name.startswith('bn'):
      layer.call(layer.input, training=False)

  # NOTE(review): the recorded run of this cell ended in an AttributeError.
  # `model` is built from the standalone `keras` package while the TPU
  # converter lives under `tf.contrib`; the mismatch may be the cause —
  # confirm, and build the model from `tf.keras` if so.
  TPU_WORKER = 'grpc://' + os.environ['COLAB_TPU_ADDR']
  tpu_model = tf.contrib.tpu.keras_to_tpu_model(
      model,
      strategy=tf.contrib.tpu.TPUDistributionStrategy(
          tf.contrib.cluster_resolver.TPUClusterResolver(TPU_WORKER)))

  tpu_model.compile(
      optimizer=tf.train.AdamOptimizer(learning_rate = 0.001),
      loss='binary_crossentropy',
      metrics=['accuracy'])

  start_time = time()
  tpu_model.fit_generator(
      train_generator,
      epochs=8,
      validation_data=validation_generator,
      callbacks=[ModelCheckpoint(check_point_name, monitor='val_acc', save_best_only=True)])
  # Save through the TPU wrapper so the weights written are the trained ones,
  # not the CPU-side `model` that was handed to the converter.
  tpu_model.save_weights(model_weights)

  print('time taken ',time()-start_time)

In [15]:
# Runs the most recently defined transfer_learning (the TPU variant from the
# previous cell) with a DenseNet169 backbone.
optimizer = Adam(lr=0.001)
transfer_learning(BASE_MODEL='DenseNet169', optimizer=optimizer)

Found 194268 images belonging to 2 classes.
Found 83256 images belonging to 2 classes.
INFO:tensorflow:Querying Tensorflow master (b'grpc://10.92.91.162:8470') for TPU system metadata.
INFO:tensorflow:Found TPU system:
INFO:tensorflow:*** Num TPU Cores: 8
INFO:tensorflow:*** Num TPU Workers: 1
INFO:tensorflow:*** Num TPU Cores Per Worker: 8
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, -1, 11139267846929878533)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 17179869184, 4851780415960072533)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_GPU:0, XLA_GPU, 17179869184, 1961844471428908267)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 17179869184, 11077741763262080519)
INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:

AttributeError: ignored