In [1]:
import os
from glob import glob

import numpy as np
from keras.utils import np_utils
from sklearn.datasets import load_files

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# define function to load train, test, and validation datasets
def load_dataset(path, number_of_classes):
    """Collect image file paths and one-hot labels from a dataset folder.

    Parameters
    ----------
    path : str
        Root directory handed to ``sklearn.datasets.load_files``.
    number_of_classes : int
        Number of columns of the one-hot encoding produced by
        ``np_utils.to_categorical``.

    Returns
    -------
    x : numpy.ndarray
        Array built from the ``filenames`` entry returned by ``load_files``.
    y : numpy.ndarray
        One-hot encoding of the ``target`` entry, one row per file.
    """
    # Only the file names and class indices are used below, so do not let
    # load_files read the raw bytes of every image into memory (its default).
    data = load_files(path, load_content=False)
    x = np.array(data['filenames'])
    y = np_utils.to_categorical(np.array(data['target']), number_of_classes)
    return x, y

In [3]:
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model 
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Input, Conv2D, MaxPooling2D, GlobalMaxPooling2D

In [4]:
# Root folders of the three dataset splits.
train_data_dir = "./data/train"
validation_data_dir = "./data/valid"
test_data_dir = "./data/test"
number_of_classes = 3

# Class names are the sub-directory names found under the training folder.
# basename(normpath(...)) drops the parent path and the trailing separator
# whatever the prefix length or OS path separator is; a fixed character
# slice silently truncates the names as soon as the prefix length differs.
tissue_names = [
    os.path.basename(os.path.normpath(class_dir))
    for class_dir in sorted(glob(os.path.join(train_data_dir, "*", "")))
]

# NOTE(review): presumably the native resolution of the source images;
# not referenced by any of the visible cells.
img_width, img_height = 3008, 2000
# Size every image is resized to when it is loaded into a tensor.
target_width, target_height = 300, 300

In [5]:
# Load file paths and one-hot targets for the train, validation and test
# splits, in that order.
(
    (train_files, train_targets),
    (valid_files, valid_targets),
    (test_files, test_targets),
) = [
    load_dataset(split_dir, number_of_classes)
    for split_dir in (train_data_dir, validation_data_dir, test_data_dir)
]

In [6]:
from keras.preprocessing import image                  
from tqdm import tqdm

def path_to_tensor(img_path):
    """Load one image file and return it as a single-item batch.

    The image is resized to ``(target_height, target_width)`` while loading,
    converted to an array, and given a leading axis of length 1 so that the
    results for several images can be stacked along axis 0.
    """
    # PIL image, already resized to the target size.
    pil_image = image.load_img(img_path, target_size=(target_height, target_width))
    # Array form of the image.
    pixels = image.img_to_array(pil_image)
    # Prepend the batch axis.
    return pixels[np.newaxis, ...]

def paths_to_tensor(img_paths):
    """Load every image in ``img_paths`` and stack them along axis 0.

    A tqdm progress bar is shown while the files are read. Each image is
    loaded with ``path_to_tensor``.
    """
    per_image_tensors = []
    for single_path in tqdm(img_paths):
        per_image_tensors.append(path_to_tensor(single_path))
    return np.concatenate(per_image_tensors, axis=0)

In [7]:
# Report the number of classes and the size of each dataset split.
total_image_count = len(train_files) + len(valid_files) + len(test_files)
summary_lines = [
    'There are %d total tissue categories.' % len(tissue_names),
    'There are %s total tissue images.\n' % total_image_count,
    'There are %d training tissue images.' % len(train_files),
    'There are %d validation tissue images.' % len(valid_files),
    'There are %d test tissue images.' % len(test_files),
]
for summary_line in summary_lines:
    print(summary_line)

There are 3 total tissue categories.
There are 2750 total tissue images.

There are 2000 training tissue images.
There are 150 validation tissue images.
There are 600 test tissue images.


In [8]:
from PIL import ImageFile

# PIL switch that tolerates truncated image files while loading.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# Pre-process the data for Keras: load each split (train, validation, test,
# in that order), cast to float32 and divide the pixel values by 255.
train_tensors, valid_tensors, test_tensors = (
    paths_to_tensor(split_files).astype('float32') / 255
    for split_files in (train_files, valid_files, test_files)
)

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [07:41<00:00,  4.33it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:55<00:00,  2.71it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [05:53<00:00,  1.70it/s]


In [59]:
### Build the network
from keras.layers.normalization import BatchNormalization

# Number of filters in each convolutional stage. Every stage is
# Conv2D -> BatchNormalization -> Dropout -> MaxPooling2D.
conv_filters = (64, 64, 128, 128)
dropout_rate = 0.2

model = Sequential()
for stage, filters in enumerate(conv_filters, start=1):
    # Only the first layer of the Sequential model declares the input shape.
    first_layer_kwargs = (
        {'input_shape': (target_height, target_width, 3)} if stage == 1 else {}
    )
    model.add(Conv2D(filters, (3, 3), activation='relu', padding='same',
                     name='conv_%d' % stage, **first_layer_kwargs))
    model.add(BatchNormalization())
    model.add(Dropout(dropout_rate))
    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='pool_%d' % stage))

# Classification
model.add(GlobalMaxPooling2D())
model.add(Dense(150, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(dropout_rate))
# One softmax output per class; tied to the configured class count so the
# head stays in sync with the one-hot targets.
model.add(Dense(number_of_classes, activation='softmax'))


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_1 (Conv2D)              (None, 300, 300, 64)      1792      
_________________________________________________________________
batch_normalization_57 (Batc (None, 300, 300, 64)      256       
_________________________________________________________________
dropout_63 (Dropout)         (None, 300, 300, 64)      0         
_________________________________________________________________
pool_1 (MaxPooling2D)        (None, 150, 150, 64)      0         
_________________________________________________________________
conv_2 (Conv2D)              (None, 150, 150, 64)      36928     
_________________________________________________________________
batch_normalization_58 (Batc (None, 150, 150, 64)      256       
_________________________________________________________________
dropout_64 (Dropout)         (None, 150, 150, 64)      0         
__________

In [60]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as an additional metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [61]:
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator

# The checkpoint callback writes into this folder; create it up front so the
# first save does not fail on a missing directory.
checkpoint_path = 'saved_models/weights.cancer.hdf5'
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# save_best_only=True: the weights file is only overwritten when the
# monitored quantity improves.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               verbose=1, save_best_only=True)

# Random augmentation applied to the training batches.
datagen = ImageDataGenerator(
        #rotation_range=5,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.15,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest')

epochs = 15

In [62]:
# Fit the model on augmented training batches. The validation tensors are
# passed as-is (no augmentation) and the checkpoint callback is attached.
augmented_train_batches = datagen.flow(train_tensors, train_targets, batch_size=12)

model.fit_generator(
    augmented_train_batches,
    validation_data=(valid_tensors, valid_targets),
    epochs=epochs,
    callbacks=[checkpointer],
    verbose=1,
)

Epoch 1/15

Epoch 00001: val_loss improved from inf to 1.04268, saving model to saved_models/weights.cancer.hdf5
Epoch 2/15



Epoch 00002: val_loss did not improve
Epoch 3/15



Epoch 00003: val_loss did not improve
Epoch 4/15



Epoch 00004: val_loss improved from 1.04268 to 1.03328, saving model to saved_models/weights.cancer.hdf5
Epoch 5/15



Epoch 00005: val_loss improved from 1.03328 to 0.96973, saving model to saved_models/weights.cancer.hdf5
Epoch 6/15



Epoch 00006: val_loss improved from 0.96973 to 0.95833, saving model to saved_models/weights.cancer.hdf5
Epoch 7/15



Epoch 00007: val_loss did not improve
Epoch 8/15



Epoch 00008: val_loss improved from 0.95833 to 0.92228, saving model to saved_models/weights.cancer.hdf5
Epoch 9/15



Epoch 00009: val_loss did not improve
Epoch 10/15



Epoch 00010: val_loss did not improve
Epoch 11/15



Epoch 00011: val_loss did not improve
Epoch 12/15



Epoch 00012: val_loss did not improve
Epoch 13/15



Epoch 00013: val_loss improved from 0.92228 to 0.87480, saving model to saved_models/weights.cancer.hdf5
Epoch 14/15



Epoch 00014: val_loss did not improve
Epoch 15/15



Epoch 00015: val_loss did not improve


<keras.callbacks.History at 0x283cff82e10>

In [63]:
# Restore the weights file written by the checkpoint callback during training.
model.load_weights(filepath='saved_models/weights.cancer.hdf5')

In [64]:
# Get the index of the predicted class for each image in the test set.
# One batched predict call replaces a separate model.predict call per image;
# batch_size=12 matches the batch size used for training.
test_probabilities = model.predict(test_tensors, batch_size=12)
class_predictions = list(np.argmax(test_probabilities, axis=1))

# Report test accuracy: percentage of images whose predicted class index
# equals the index of the one-hot target.
true_classes = np.argmax(test_targets, axis=1)
test_accuracy = 100 * np.mean(np.array(class_predictions) == true_classes)
print('Test accuracy: %.4f%%' % test_accuracy)

Test accuracy: 61.8333%


In [65]:
import csv

# Class probabilities for every test image, computed with one batched predict
# call instead of one call per image. The extra middle axis keeps each list
# item shaped (1, n_classes), the same shape a single-image predict returns.
class_probabilities = list(
    model.predict(test_tensors, batch_size=12)[:, np.newaxis, :]
)

In [66]:
import csv

# Columns of the model output written to the results file.
# NOTE(review): which tissue class each index stands for depends on the class
# ordering produced when the dataset was loaded -- confirm before submitting.
task_1_class_index = 0
task_2_class_index = 2

# Write one row per test image: its file path and the two selected
# probabilities. zip pairs each path with its prediction directly, so no
# manual row counter is needed.
with open('results.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(['Id', 'task_1', 'task_2'])
    for file_path, result in zip(test_files, class_probabilities):
        csv_writer.writerow([
            file_path,
            result[0, task_1_class_index],
            result[0, task_2_class_index],
        ])