In [None]:
# Import required libraries
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import os
import matplotlib.pyplot as plt
from matplotlib import patches
import cv2
from keras.callbacks import Callback
from keras_preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers.convolutional import Conv2D, MaxPooling2D
from keras.layers.core import Activation, Flatten, Dense
from keras.layers import BatchNormalization
from keras.layers import Dropout
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint
from sklearn.metrics import roc_curve,auc, confusion_matrix

In [None]:
# Count the files present in each image directory.
train_file_count = len(os.listdir("../input/train"))
print ("Number of train files:", train_file_count)
test_file_count = len(os.listdir("../input/test"))
print ("Number of test files:", test_file_count)

# Every column is read as str (dtype=str), so ids and labels stay textual.
dftrain = pd.read_csv("../input/train_labels.csv", dtype=str)
dftrain.head()

In [None]:
print("Counts of negative and postive labels in training data:")
# Number of rows per distinct value of the `label` column
# (count() reports non-null entries for each remaining column).
label_counts = dftrain.groupby('label').count()
label_counts

In [None]:
def add_ext(id):
    """Return the image file name for ``id`` by appending ``.tif``.

    A value that already ends in ``.tif`` is returned unchanged, so applying
    this function twice (e.g. by re-running the notebook cell that maps it
    over the ``id`` column) does not produce ``name.tif.tif``.

    The parameter name shadows the builtin ``id``; it is kept so existing
    keyword callers continue to work.
    """
    suffix = ".tif"
    if id.endswith(suffix):
        return id
    return id + suffix

# Rewrite every entry of the `id` column through add_ext.
dftrain["id"] = dftrain["id"].map(add_ext)

def addpath(col):
    """Return ``col`` prefixed with the training image directory."""
    train_dir = '../input/train/'
    full_path = train_dir + col
    return full_path

# New `Path` column: each `id` value passed through addpath.
dftrain['Path'] = dftrain['id'].map(addpath)

dftrain.head()

In [None]:
## function to plot histograms

def plothist(plot_img, axnum, col_idx=None, axes=None, colors=('r', 'g', 'b')):
    """Draw one 256-bin histogram line per colour channel of ``plot_img``.

    Parameters
    ----------
    plot_img : image array passed to ``cv2.calcHist``; channel ``j`` is
        drawn with ``colors[j]``.
    axnum : row index into the axes grid.
    col_idx : column index into the axes grid. When omitted, the
        module-level loop variable ``i`` is used, which is what the
        original two-argument calls relied on.
    axes : axes grid indexed as ``axes[row, col]``. When omitted, the
        module-level ``ax`` is used.
    colors : line colour for each channel index, in channel order.

    Returns
    -------
    None
    """
    # Fall back to the notebook globals so existing plothist(img, row) calls
    # keep working; pass the arguments explicitly to avoid the hidden state.
    if axes is None:
        axes = ax
    if col_idx is None:
        col_idx = i

    target = axes[axnum, col_idx]
    # The default is ('r', 'g', 'b') because the sample cell loads images
    # with plt.imread (channel 0 = red); the previous ('b', 'g', 'r') order
    # matches cv2.imread and drew the red channel in blue.
    for j, col in enumerate(colors):
        histr = cv2.calcHist([plot_img], [j], None, [256], [0, 256])
        target.plot(histr, color=col)

    # Axis decoration does not depend on the channel, so it is set once.
    target.set_xlim([0, 256])
    target.set_xlabel("Pixel Values")
    # Only the first column of the row carries the y-axis label.
    axes[axnum, 0].set_ylabel("# of Pixels")
    return

In [None]:
## print a sample of the images
nums = [76, 46, 69, 20, 17] # fixed row positions (originally drawn with random.sample(range(1, 100), 5))
num_pics = len(nums)
# 3 rows: image, histogram of the full image, histogram of the marked patch
f,ax = plt.subplots(3,num_pics,figsize=(15,15))

# `i` and `ax` stay module-level names: plothist reads them as globals
# when it is called with only (image, row).
for i in range(num_pics):
    sample = dftrain.iloc[nums[i]]
    img = plt.imread(sample['Path'])
    ax[0,i].imshow(img)
    # Title with the label of the row that is actually displayed
    # (previously row i was used, which is a different image than nums[i]).
    ax[0,i].set_title(sample['label'],fontweight="bold", size=20)
    # Outline the 32x32 region whose top-left corner is at (32, 32)
    rect = patches.Rectangle((32,32),32,32,linewidth=3,edgecolor='r',facecolor='none')
    ax[0,i].add_patch(rect)
    ## plot histograms of full image and cancer patch
    plothist(img,1)
    plothist(img[32:64, 32:64],2)

plt.show()

In [None]:
## generator used with flow_from_dataframe below:
## rescales pixel values by 1/255 and sets a validation_split of 0.2

datagen = ImageDataGenerator(
    rescale=1./255.,
    validation_split=0.2,
)

In [None]:
batch_size = 20
image_size = (96,96)

# Arguments common to both generators; only `subset` differs between them.
flow_options = dict(
    dataframe=dftrain,
    directory="../input/train/",
    x_col="id",
    y_col="label",
    batch_size=batch_size,
    seed=42,
    shuffle=True,
    class_mode="binary",
    target_size=image_size,
)

# "training" and "validation" select the two sides of datagen's validation_split.
train_generator = datagen.flow_from_dataframe(subset="training", **flow_options)

validation_generator = datagen.flow_from_dataframe(subset="validation", **flow_options)

In [None]:
# build model
input_shape = (96,96,3)
kernel_size = 3

model = Sequential()

# Four stages with the same layout: conv -> relu -> dropout(0.25) -> 2x2 max-pool.
# Only the number of convolution filters changes from stage to stage.
for stage, n_filters in enumerate((16, 32, 64, 128)):
    # Only the first layer of the model is given the input shape.
    first_layer_kwargs = {"input_shape": input_shape} if stage == 0 else {}
    model.add(Conv2D(n_filters, (kernel_size, kernel_size), **first_layer_kwargs))
    model.add(Activation('relu'))
    model.add(Dropout(0.25))
    model.add(MaxPooling2D(pool_size=(2, 2)))

# Classifier head: flatten, then a single sigmoid unit.
model.add(Flatten())
model.add(Dense(1))
model.add(Activation('sigmoid'))

optimizer = Adam(0.001)
model.compile(optimizer, loss = "binary_crossentropy", metrics=["accuracy"])
model.summary()

In [None]:
# Number of whole batches in each generator (floor division).
trainstep = train_generator.n // train_generator.batch_size
valstep = validation_generator.n // validation_generator.batch_size

filepath = "weights-best.hdf5"
# Write a checkpoint to `filepath` only when the monitored val_acc reaches a new maximum.
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)

fit_options = dict(
    generator=train_generator,
    steps_per_epoch=trainstep,
    validation_data=validation_generator,
    validation_steps=valstep,
    epochs=10,
    callbacks=[checkpoint],
)
history = model.fit_generator(**fit_options)

In [None]:
# plot learning curves

# One figure per recorded quantity: (key in history.history, axis label).
# The 'test' legend entry is the curve stored under the 'val_' key.
for history_key, axis_label in (('acc', 'accuracy'), ('loss', 'loss')):
    plt.plot(history.history[history_key])
    plt.plot(history.history['val_' + history_key])
    plt.title('model ' + axis_label)
    plt.ylabel(axis_label)
    plt.xlabel('epoch')
    plt.legend(['train', 'test'], loc='upper left')
    plt.show()

In [None]:
## Create test generator and evaluate model
# Restore the checkpoint written to `filepath` during training.
model.load_weights(filepath)
test_datagen = ImageDataGenerator(rescale=1./255)

# The evaluation generator is built from `datagen` with subset="validation"
# and shuffle=False, so it iterates the validation subset in a fixed order.
eval_flow_options = dict(
    dataframe=dftrain,
    directory="../input/train/",
    x_col="id",
    y_col="label",
    subset="validation",
    batch_size=5,   # want to divide num samples evenly
    seed=42,
    shuffle=False,  # don't shuffle
    class_mode="binary",
    target_size=image_size,
)
test_generator = datagen.flow_from_dataframe(**eval_flow_options)



In [None]:
# scores[0] is the loss and scores[1] the metric named by model.metrics_names[1].
scores = model.evaluate_generator(test_generator)
# The loss is not a percentage, so it is reported unscaled
# (it was previously multiplied by 100 like the accuracy).
print('Test loss:', round(scores[0],4))
print('Test accuracy:', round(100*scores[1],2))
print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))

In [None]:
test_labels = test_generator.classes

# Start from the first batch so predictions line up with test_labels,
# and use an integer step count that covers every sample.
test_generator.reset()
y_preds = model.predict_generator(test_generator,verbose=1,steps=len(test_generator))

# Hard 0/1 predictions (threshold 0.5); used by the confusion matrix cell.
y_pred_keras=y_preds.round()

# The ROC curve must be computed from the continuous scores: feeding the
# rounded predictions leaves a single operating point and a misleading AUC.
fpr_keras, tpr_keras, thresholds_keras = roc_curve(test_labels, y_preds.ravel())
auc_keras = auc(fpr_keras, tpr_keras)

print('AUC score :', auc_keras)

In [None]:
# plot ROC curve

plt.figure(1)
plt.title('ROC curve')
plt.xlabel('False positive rate')
plt.ylabel('True positive rate')
# Dashed diagonal from (0, 0) to (1, 1) as a reference line.
plt.plot([0, 1], [0, 1], 'k--')
roc_label = 'Keras (area = {:.3f})'.format(auc_keras)
plt.plot(fpr_keras, tpr_keras, label=roc_label)
plt.legend(loc='best')
plt.show()

In [None]:
classes = [0, 1]
# Confusion matrix of the true labels against the thresholded predictions.
cm = confusion_matrix(test_labels, y_pred_keras)

# One tick per class on both axes.
tick_marks = np.arange(len(classes))
plt.xticks(tick_marks, classes)
plt.yticks(tick_marks, classes)
plt.xlabel('Predicted label')
plt.ylabel('True label')
plt.imshow(cm, cmap=plt.cm.Blues)
print(cm)

In [None]:
## predictions for submitting

In [None]:
# One row per file found in the test directory.
test_results=pd.DataFrame({'id':os.listdir("../input/test/")})
test_datagen=ImageDataGenerator(rescale=1./255)

# Build the generator from test_datagen (rescale only). Previously the
# validation-split `datagen` was used and test_datagen was left unused.
submit_generator=test_datagen.flow_from_dataframe(
    dataframe=test_results,
    directory="../input/test/",
    x_col="id",
    batch_size=2,   # want to divide num samples evenly
    shuffle=False,  # don't shuffle
    class_mode=None,  # no labels: the generator yields images only
    target_size=image_size)

In [None]:
## use 0.5 as threshold to assign to class 0 or 1
# Start from the first batch and use an integer step count; len() of the
# generator is its number of batches (the former n/2 was a float).
submit_generator.reset()
y_test_prob=model.predict_generator(submit_generator,verbose=1,steps=len(submit_generator))
y_test_pred=y_test_prob.round()

In [None]:
def remove_ext(id):
    """Return ``id`` without its final ``.extension``.

    Only the text after the last dot is removed, so a name that itself
    contains dots keeps them (``"a.b.tif"`` -> ``"a.b"``); a name without
    any dot is returned unchanged. Splitting on the first dot instead would
    truncate such names to ``"a"``.

    The parameter name shadows the builtin ``id``; it is kept so existing
    keyword callers continue to work.
    """
    return id.rsplit('.', 1)[0]
# Replace each file name in the `id` column with the result of remove_ext.
test_results['id'] = test_results['id'].map(remove_ext)

In [None]:
# Attach the thresholded predictions as the `label` column.
test_results['label'] = y_test_pred

# Write the frame without its index column, then preview it.
submission_path = "submission.csv"
test_results.to_csv(submission_path, index=False)
test_results.head()

In [None]:
# Show the label -> index mapping of the labelled training generator.
# submit_generator is created with class_mode=None, so it has no label mapping to report.
print(train_generator.class_indices)

**Reference Material:**

I found the following kernels and resources very helpful as I worked through my first Kaggle entry! Thank you!

https://www.kaggle.com/vbookshelf/cnn-how-to-use-160-000-images-without-crashing <br>
https://www.kaggle.com/fmarazzi/baseline-keras-cnn-roc-fast-10min-0-925-lb <br>
 (more to come)
