In [5]:
from sklearn.datasets import load_files       
from keras.utils import np_utils
import numpy as np
from glob import glob

In [43]:
# define function to load train, test, and validation datasets
def load_dataset(path, number_of_classes):
    """Index a class-per-folder image directory.

    Parameters
    ----------
    path : str
        Directory passed to ``sklearn.datasets.load_files``.
    number_of_classes : int
        Width of the one-hot encoding passed to ``to_categorical``.

    Returns
    -------
    x : numpy.ndarray
        The ``filenames`` entry returned by ``load_files``.
    y : numpy.ndarray
        The ``target`` entry, one-hot encoded with ``number_of_classes`` columns.
    """
    # Only the file names and labels are used below, so skip reading every
    # file's bytes into memory (load_files does that by default).
    data = load_files(path, load_content=False)
    x = np.array(data['filenames'])
    y = np_utils.to_categorical(np.array(data['target']), number_of_classes)
    return x, y

In [7]:
from keras import applications
from keras.preprocessing.image import ImageDataGenerator
from keras import optimizers
from keras.models import Sequential, Model 
from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D, Input, Conv2D, MaxPooling2D, GlobalMaxPooling2D

In [45]:
import os

# Dataset split directories; the training directory is globbed below for
# one sub-directory per class.
train_data_dir = "./data/train"
validation_data_dir = "./data/valid"
test_data_dir = "./data/test"
number_of_classes = 3


def list_class_names(data_dir):
    """Return the sorted names of the immediate sub-directories of ``data_dir``.

    The names are taken with ``os.path`` rather than a fixed-width slice, so
    they stay correct whatever the length of ``data_dir`` and whichever path
    separator ``glob`` returns. An empty list is returned when ``data_dir``
    has no sub-directories (or does not exist).
    """
    # A trailing separator in the pattern makes glob match directories only.
    class_dirs = glob(os.path.join(data_dir, "*", ""))
    return sorted(os.path.basename(os.path.normpath(d)) for d in class_dirs)


# load list of tissue class names (one per sub-directory of the training set)
tissue_names = list_class_names(train_data_dir)
# NOTE(review): img_width/img_height are not referenced by any other visible
# cell; presumably the native size of the source photos - confirm.
img_width, img_height = 3008, 2000
# Size every image is resized to before being fed to the network.
target_width, target_height = 300, 300

In [9]:
# Index the three dataset splits: each call yields the file paths and the
# one-hot targets for one split directory (train, validation, test in order).
(
    (train_files, train_targets),
    (valid_files, valid_targets),
    (test_files, test_targets),
) = (
    load_dataset(split_dir, number_of_classes)
    for split_dir in (train_data_dir, validation_data_dir, test_data_dir)
)

In [46]:
from keras.preprocessing import image                  
from tqdm import tqdm

def path_to_tensor(img_path):
    """Load one image file and return it as a single-image batch.

    The image is loaded with ``target_size=(target_height, target_width)``
    (module-level settings), converted to an array, and given a new leading
    axis of length 1 so that several results can be stacked along axis 0.
    """
    # PIL image loaded at the target size (the original comment notes RGB).
    pil_img = image.load_img(img_path, target_size=(target_height, target_width))
    # 3D array -> 4D array with a leading batch axis.
    return image.img_to_array(pil_img)[np.newaxis, ...]

def paths_to_tensor(img_paths):
    """Load every path in ``img_paths`` and stack the results along axis 0.

    Progress is reported through ``tqdm`` while the images are loaded.
    """
    batches = []
    for single_path in tqdm(img_paths):
        batches.append(path_to_tensor(single_path))
    return np.vstack(batches)

In [47]:
# Report the number of classes and the size of each split.
n_train, n_valid, n_test = len(train_files), len(valid_files), len(test_files)
n_total = n_train + n_valid + n_test

print('There are %d total tissue categories.' % len(tissue_names))
print('There are %s total tissue images.\n' % n_total)
print('There are %d training tissue images.' % n_train)
print('There are %d validation tissue images.' % n_valid)
print('There are %d test tissue images.'% n_test)

There are 3 total tissue categories.
There are 2750 total tissue images.

There are 2000 training tissue images.
There are 150 validation tissue images.
There are 600 test tissue images.


In [48]:
from PIL import ImageFile
# Let PIL load image files that are truncated instead of raising on them.
ImageFile.LOAD_TRUNCATED_IMAGES = True

# pre-process the data for Keras: load each split as float32 and divide by 255
train_tensors, valid_tensors, test_tensors = (
    paths_to_tensor(split_files).astype('float32') / 255
    for split_files in (train_files, valid_files, test_files)
)

100%|██████████████████████████████████████████████████████████████████████████████| 2000/2000 [07:21<00:00,  4.53it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 150/150 [00:51<00:00,  2.89it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 600/600 [05:46<00:00,  1.73it/s]


In [49]:
### Build the network
model = Sequential()

# Feature extractor: five identical stages of
#   3x3 same-padded ReLU convolution -> dropout(0.25) -> 2x2 max-pooling,
# named conv_1..conv_5 / pool_1..pool_5. Only the filter count varies.
# (A sixth stage and a second dense layer were left disabled in the original
# notebook and are not built here.)
conv_filters = (64, 64, 128, 128, 128)
for stage, filters in enumerate(conv_filters, start=1):
    # Only the first layer of a Sequential model declares the input shape.
    first_layer_kwargs = {}
    if stage == 1:
        first_layer_kwargs['input_shape'] = (target_height, target_width, 3)
    model.add(Conv2D(filters, (3, 3), activation='relu', padding='same',
                     name='conv_%d' % stage, **first_layer_kwargs))
    model.add(Dropout(0.25))
    model.add(MaxPooling2D((2, 2), strides=(2, 2), name='pool_%d' % stage))

# Classification
model.add(GlobalMaxPooling2D())
model.add(Dense(100, activation='relu'))
model.add(Dropout(0.25))
# One softmax output per class; tied to number_of_classes so the head always
# matches the width of the one-hot targets built by load_dataset.
model.add(Dense(number_of_classes, activation='softmax'))


model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv_1 (Conv2D)              (None, 300, 300, 64)      1792      
_________________________________________________________________
dropout_21 (Dropout)         (None, 300, 300, 64)      0         
_________________________________________________________________
pool_1 (MaxPooling2D)        (None, 150, 150, 64)      0         
_________________________________________________________________
conv_2 (Conv2D)              (None, 150, 150, 64)      36928     
_________________________________________________________________
dropout_22 (Dropout)         (None, 150, 150, 64)      0         
_________________________________________________________________
pool_2 (MaxPooling2D)        (None, 75, 75, 64)        0         
_________________________________________________________________
conv_3 (Conv2D)              (None, 75, 75, 128)       73856     
__________

In [50]:
# Configure training: Adam optimizer, categorical cross-entropy loss, and
# accuracy reported as the metric.
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=["accuracy"],
)

In [51]:
import os

from keras.callbacks import ModelCheckpoint
from keras.preprocessing.image import ImageDataGenerator

# Where the best weights are written during training.
checkpoint_path = 'saved_models/weights.cancer.hdf5'
# The checkpoint callback does not create missing directories; make sure the
# target folder exists so a fresh run does not fail at the first save.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)

# Save weights only when the monitored validation loss improves.
checkpointer = ModelCheckpoint(filepath=checkpoint_path,
                               monitor='val_loss',
                               verbose=1, save_best_only=True)

# Training-time augmentation: random shifts, shear, zoom and horizontal flips.
# Rotation was left disabled in the original notebook.
datagen = ImageDataGenerator(
        width_shift_range=0.15,
        height_shift_range=0.15,
        shear_range=0.15,
        zoom_range=0.15,
        horizontal_flip=True,
        fill_mode='nearest')

epochs = 30

In [52]:
# Fit model on augmented training batches (batch size 10), validating on the
# un-augmented validation tensors after each epoch.
train_flow = datagen.flow(train_tensors, train_targets, batch_size=10)

model.fit_generator(
    train_flow,
    epochs=epochs,
    validation_data=(valid_tensors, valid_targets),
    callbacks=[checkpointer],
    verbose=1,
)

Epoch 1/30

Epoch 00001: val_loss improved from inf to 1.03698, saving model to saved_models/weights.cancer.hdf5
Epoch 2/30



Epoch 00002: val_loss improved from 1.03698 to 1.02779, saving model to saved_models/weights.cancer.hdf5
Epoch 3/30



Epoch 00003: val_loss improved from 1.02779 to 1.00947, saving model to saved_models/weights.cancer.hdf5
Epoch 4/30



Epoch 00004: val_loss did not improve
Epoch 5/30



Epoch 00005: val_loss improved from 1.00947 to 0.96689, saving model to saved_models/weights.cancer.hdf5
Epoch 6/30



Epoch 00006: val_loss did not improve
Epoch 7/30



Epoch 00007: val_loss did not improve
Epoch 8/30



Epoch 00008: val_loss did not improve
Epoch 9/30



Epoch 00009: val_loss did not improve
Epoch 10/30



Epoch 00010: val_loss did not improve
Epoch 11/30



Epoch 00011: val_loss did not improve
Epoch 12/30



Epoch 00012: val_loss did not improve
Epoch 13/30



Epoch 00013: val_loss did not improve
Epoch 14/30



Epoch 00014: val_loss did not improve
Epoch 15/30



Epoch 00015: val_loss did not improve
Epoch 16/30



Epoch 00016: val_loss did not improve
Epoch 17/30



Epoch 00017: val_loss did not improve
Epoch 18/30



Epoch 00018: val_loss did not improve
Epoch 19/30



Epoch 00019: val_loss did not improve
Epoch 20/30



Epoch 00020: val_loss did not improve
Epoch 21/30



Epoch 00021: val_loss did not improve
Epoch 22/30



Epoch 00022: val_loss did not improve
Epoch 23/30



Epoch 00023: val_loss did not improve
Epoch 24/30



Epoch 00024: val_loss did not improve
Epoch 25/30



Epoch 00025: val_loss did not improve
Epoch 26/30



Epoch 00026: val_loss did not improve
Epoch 27/30



Epoch 00027: val_loss did not improve
Epoch 28/30



Epoch 00028: val_loss did not improve
Epoch 29/30



Epoch 00029: val_loss did not improve
Epoch 30/30



Epoch 00030: val_loss did not improve


<keras.callbacks.History at 0x26615b70550>

In [53]:
# Load the weights stored in the saved-model file into the current model.
best_weights_path = 'saved_models/weights.cancer.hdf5'
model.load_weights(best_weights_path)

In [54]:
# get index of predicted class for each image in test set
# A single batched predict call replaces one predict call per image; the
# small batch size mirrors the one used for training to keep memory bounded.
test_probabilities = model.predict(test_tensors, batch_size=10)
class_predictions = list(np.argmax(test_probabilities, axis=1))

# report test accuracy (percentage of images whose arg-max class matches the
# arg-max of the one-hot target)
true_classes = np.argmax(test_targets, axis=1)
test_accuracy = 100*np.sum(np.array(class_predictions)==true_classes)/len(class_predictions)
print('Test accuracy: %.4f%%' % test_accuracy)

Test accuracy: 65.5000%


In [55]:
import csv

# Class probabilities for every test image, computed in one batched predict
# call instead of one call per image. Each entry keeps the (1, n_classes)
# shape that a single-image predict returns, so code indexing result[0, k]
# keeps working.
class_probabilities = [row[np.newaxis, :]
                       for row in model.predict(test_tensors, batch_size=10)]

In [56]:
import csv

# Write one row per test image to results.csv: the image's file path, then
# the predicted probabilities at class index 0 (task_1) and index 2 (task_2).
header = ['Id', 'task_1', 'task_2']
with open('results.csv', 'w', newline='') as csvfile:
    csv_writer = csv.writer(csvfile, delimiter=',')
    csv_writer.writerow(header)
    for file_index, result in enumerate(class_probabilities):
        csv_writer.writerow([test_files[file_index], result[0, 0], result[0, 2]])