[View in Colaboratory](https://colab.research.google.com/github/gowtham91m/Predicting-IDC-in-Breast-Cancer-Histology-Images/blob/master/IDC.ipynb)

In [0]:
# Install Kaggle API for download competition data
!pip3 install -q kaggle
import os,shutil,fnmatch, random
from glob import glob
from time import time
import cv2
import keras
from keras.callbacks import Callback, EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator
from keras.utils.np_utils import to_categorical
from keras.models import Sequential, model_from_json
from keras.optimizers import SGD, RMSprop, Adam, Adagrad, Adadelta
from keras.layers import Dense, Dropout, Activation, Flatten, BatchNormalization, Conv2D, MaxPool2D, MaxPooling2D

In [4]:
os.chdir('/content')
if 'kaggle.txt' in ('/content'): os.remove('kaggle.text')
from google.colab import files
downloaded = files.upload()

Saving kaggle.txt to kaggle (3).txt


In [5]:
with open('kaggle.txt') as f:
    key = f.read()
os.environ['KAGGLE_USERNAME']="gowham91m"
os.environ['KAGGLE_KEY']=key
 
!kaggle datasets download -d paultimothymooney/breast-histopathology-images

Downloading breast-histopathology-images.zip to /content
100% 1.48G/1.49G [00:10<00:00, 102MB/s]
100% 1.49G/1.49G [00:10<00:00, 150MB/s]


In [0]:
!unzip -q -o breast-histopathology-images.zip -d breast-histopathology-images
os.chdir('/content/breast-histopathology-images')
!unzip -q -o IDC_regular_ps50_idx5.zip -d IDC_regular_ps50_idx5
os.chdir('/content/breast-histopathology-images/IDC_regular_ps50_idx5')

In [7]:
images = glob('/content/breast-histopathology-images/IDC_regular_ps50_idx5/**/0/*.png', recursive=True)
print(len(images))

198738


In [8]:
images = glob('/content/breast-histopathology-images/IDC_regular_ps50_idx5/**/1/*.png', recursive=True)
print(len(images))

78786


In [0]:
if 'IDC' in os.listdir('/content'): shutil.rmtree('/content/IDC')
if 'IDC' not in os.listdir('/content'): os.mkdir('/content/IDC')
  
if 'train' not in os.listdir('/content/IDC'): os.mkdir('/content/IDC/train')
if '0' not in os.listdir('/content/IDC/train'): os.mkdir('/content/IDC/train/0')
if '1' not in os.listdir('/content/IDC/train'): os.mkdir('/content/IDC/train/1')
  
if 'val' not in os.listdir('/content/IDC'): os.mkdir('/content/IDC/val')
if '0' not in os.listdir('/content/IDC/val'): os.mkdir('/content/IDC/val/0')
if '1' not in os.listdir('/content/IDC/val'): os.mkdir('/content/IDC/val/1')

In [0]:
for file in glob('/content/breast-histopathology-images/IDC_regular_ps50_idx5/**/0/*.png'):
    shutil.copy2(file, '/content/IDC/train/0');
for file in glob('/content/breast-histopathology-images/IDC_regular_ps50_idx5/**/1/*.png'):
    shutil.copy2(file, '/content/IDC/train/1');

In [0]:
train_neg_path = '/content/IDC/train/0'
train_pos_path = '/content/IDC/train/1'

val_neg_path = '/content/IDC/val/0'
val_pos_path = '/content/IDC/val/1'

# split train date into train and validation
train_neg_len = len(os.listdir('/content/IDC/train/0'))
val_neg_len = train_neg_len * 0.3

train_pos_len = len(os.listdir('/content/IDC/train/1'))
val_pos_len = train_pos_len * 0.3

val_pos = random.sample(os.listdir(train_pos_path),int(val_pos_len))
val_neg = random.sample(os.listdir(train_neg_path),int(val_neg_len))

In [0]:
for file in val_pos:
  try: shutil.move(os.path.join(train_pos_path,file), val_pos_path)
  except: pass
for file in val_neg:
  try: shutil.move(os.path.join(train_neg_path,file), val_neg_path)
  except: pass

In [27]:
print(len(os.listdir('/content/IDC/train/0')))
print(len(os.listdir('/content/IDC/train/1')))

print(len(os.listdir('/content/IDC/val/0')))
print(len(os.listdir('/content/IDC/val/1')))

139117
55151
59621
23635


In [0]:
train_datagen = ImageDataGenerator(
        rescale=1./255,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True)
val_datagen = ImageDataGenerator(rescale=1./255)

In [29]:
batch_size=16
train_generator = train_datagen.flow_from_directory(
        '/content/IDC/train',  # this is the target directory
        target_size=(150, 150),  # all images will be resized to 150x150
        batch_size=batch_size,
        class_mode='binary') 

Found 194268 images belonging to 2 classes.


In [30]:
validation_generator = val_datagen.flow_from_directory(
        '/content/IDC/val',
        target_size=(150, 150),
        batch_size=batch_size,
        class_mode='binary')

Found 83256 images belonging to 2 classes.


In [0]:
model = Sequential()
model.add(Conv2D(64, (3, 3), input_shape=( 150, 150, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2,2)))

model.add(Conv2D(32, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Conv2D(64, (3, 3)))
model.add(Activation('relu'))
model.add(MaxPooling2D(pool_size=(2, 2)))

model.add(Flatten())  # this converts our 3D feature maps to 1D feature vectors
model.add(Dense(64))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(1))
model.add(Activation('sigmoid'))

model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

In [35]:

start_time = time()
model.fit_generator(
        train_generator,
        steps_per_epoch=2000 // batch_size,
        epochs=16,
        validation_data=validation_generator,
        validation_steps=800 // batch_size)
model.save_weights('first_try.h5')

print('time taken ',time()-start_time)

Epoch 1/16
Epoch 2/16
Epoch 3/16
Epoch 4/16
Epoch 5/16
Epoch 6/16
Epoch 7/16
Epoch 8/16
Epoch 9/16
Epoch 10/16
Epoch 11/16
Epoch 12/16
Epoch 13/16
Epoch 14/16
Epoch 15/16
Epoch 16/16
time taken  228.30275416374207
