In [1]:
import numpy as np
import pandas as pd
import os

Reading file names and labels.

In [2]:
# Folders holding the training and test images.
train_dir, test_dir = "data/train/", "data/test/"

# Label table for the training images; the preview below shows its
# `id` and `label` columns.
df_train = pd.read_csv(filepath_or_buffer='data/train_labels.csv')
df_train.head(n=5)

Unnamed: 0,id,label
0,f38a6374c348f90b587e046aac6079959adf3835,0
1,c18f2d887b7ae4f6742ee445113fa1aef383ed77,1
2,755db6279dae599ebb4d39a9123cce439965282d,0
3,bc3f0c64fb968ff4a8bd33af6971ecae77c75e08,0
4,068aba587a4950175d04c680d38943fd488d6a9d,0


Keras' `flow_from_dataframe` expects the label column to contain strings when `class_mode='binary'` is used, so we cast the labels to strings.

In [3]:
# Cast the label column to str; every other column keeps its dtype.
df_train = df_train.astype({'label': str})

Splitting data for training and validation. 80% of data will be used for training and 20% for validation.

In [4]:
from sklearn.model_selection import train_test_split

# Fixed seed so that a fresh kernel reproduces exactly the same
# train/validation partition (and therefore comparable validation scores).
SPLIT_SEED = 42

df = df_train
# 80/20 split; stratifying on the label keeps the class ratio identical in
# both partitions instead of leaving it to chance.
train, valid = train_test_split(
    df,
    test_size=0.2,
    random_state=SPLIT_SEED,
    stratify=df['label'],
)

Overall there are more than 200,000 images, which is too many to fit into memory at once, so we create generators that read the images from disk in batches.

In [5]:
from tensorflow.python.keras.preprocessing.image import ImageDataGenerator

# Random flips along both axes are the only augmentation, and they are
# applied to the training stream only.
flip_augmentation = {"horizontal_flip": True, "vertical_flip": True}
train_datagen = ImageDataGenerator(**flip_augmentation)

# Validation images go through a generator with no augmentation at all.
test_datagen = ImageDataGenerator()

In [6]:
# Images per batch for both data generators; also used later to derive the
# number of steps per epoch.
batch_size = 256

In [7]:
def _flow_from_frame(datagen, frame, shuffle):
    """Stream 96x96 images listed in `frame` from the training folder.

    `frame` supplies file ids in its `id` column and binary labels in its
    `label` column. `has_ext=False` is passed because the ids presumably
    carry no file extension (TODO confirm against the files on disk).
    """
    return datagen.flow_from_dataframe(
        dataframe=frame,
        directory='data/train/',
        x_col='id',
        y_col='label',
        has_ext=False,
        batch_size=batch_size,
        shuffle=shuffle,
        class_mode='binary',
        target_size=(96, 96),
    )


# Training batches are reshuffled; validation batches keep the frame order.
train_generator = _flow_from_frame(train_datagen, train, shuffle=True)
valid_generator = _flow_from_frame(test_datagen, valid, shuffle=False)

Found 176019 images belonging to 2 classes.
Found 44005 images belonging to 2 classes.


Creating a fairly basic CNN with batch normalization.

In [8]:
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Conv2D, Dense, MaxPool2D, Flatten
from tensorflow.python.keras.layers import BatchNormalization, Activation, Dropout
from tensorflow.python.keras.optimizers import Adam

# Architecture hyper-parameters.
kernel_size = (3, 3)
pool_size = (2, 2)
first_filter = 32
second_filter = 64
third_filter = 128

dropout_conv = 0.3
dropout_dense = 0.3


def _conv_bn_relu(filters):
    """Bias-free convolution followed by batch normalization and ReLU."""
    return [
        Conv2D(filters, kernel_size, use_bias=False),
        BatchNormalization(),
        Activation('relu'),
    ]


def _pool_and_drop():
    """Max-pooling followed by the convolutional dropout rate."""
    return [MaxPool2D(pool_size=pool_size), Dropout(dropout_conv)]


# Layers are instantiated strictly in network order.
layer_stack = []

# Stage 1: the very first convolution keeps its bias and a fused ReLU and
# declares the 96x96 RGB input shape.
layer_stack.append(
    Conv2D(first_filter, kernel_size, activation='relu', input_shape=(96, 96, 3))
)
layer_stack += _conv_bn_relu(first_filter)
layer_stack += _pool_and_drop()

# Stages 2 and 3: two conv/BN/ReLU units each, then pooling and dropout.
for stage_filters in (second_filter, third_filter):
    layer_stack += _conv_bn_relu(stage_filters)
    layer_stack += _conv_bn_relu(stage_filters)
    layer_stack += _pool_and_drop()

# Classifier head: one hidden dense unit block and a single sigmoid output.
layer_stack += [
    Flatten(),
    Dense(256, use_bias=False),
    BatchNormalization(),
    Activation('relu'),
    Dropout(dropout_dense),
    Dense(1, activation='sigmoid'),
]

model = Sequential()
for layer in layer_stack:
    model.add(layer)

model.compile(optimizer=Adam(0.01), loss='binary_crossentropy', metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [9]:
# Print the layer-by-layer table of output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d (Conv2D)              (None, 94, 94, 32)        896       
_________________________________________________________________
conv2d_1 (Conv2D)            (None, 92, 92, 32)        9216      
_________________________________________________________________
batch_normalization_v1 (Batc (None, 92, 92, 32)        128       
_________________________________________________________________
activation (Activation)      (None, 92, 92, 32)        0         
_________________________________________________________________
max_pooling2d (MaxPooling2D) (None, 46, 46, 32)        0         
_________________________________________________________________
dropout (Dropout)            (None, 46, 46, 32)        0         
_________________________________________________________________
conv2d_2 (Conv2D)            (None, 44, 44, 64)        18432     
__________

Training the model.

In [10]:
from tensorflow.python.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Whole batches per pass over each generator (floor division, so a trailing
# partial batch is not counted).
STEP_SIZE_TRAIN, STEP_SIZE_VALID = (
    flow.n // batch_size for flow in (train_generator, valid_generator)
)

# Multiply the learning rate by 0.1 after one epoch without val_loss improvement.
reducel = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=1, verbose=1)
# Stop after two epochs without val_loss improvement and restore the best weights.
earlystopper = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
    verbose=1,
)

fit_options = dict(
    steps_per_epoch=STEP_SIZE_TRAIN,
    validation_data=valid_generator,
    validation_steps=STEP_SIZE_VALID,
    epochs=10,
    callbacks=[reducel, earlystopper],
)
history = model.fit_generator(train_generator, **fit_options)

Instructions for updating:
Use tf.cast instead.
Epoch 1/10
Epoch 2/10

Epoch 00002: ReduceLROnPlateau reducing learning rate to 0.0009999999776482583.
Epoch 3/10
Epoch 4/10
Epoch 5/10

Epoch 00005: ReduceLROnPlateau reducing learning rate to 9.999999310821295e-05.
Epoch 6/10
Epoch 7/10

Epoch 00007: ReduceLROnPlateau reducing learning rate to 9.999999019782991e-06.
Epoch 8/10

Epoch 00008: ReduceLROnPlateau reducing learning rate to 9.99999883788405e-07.
Restoring model weights from the end of the best epoch.
Epoch 00008: early stopping


Testing the model.

In [11]:
from glob import glob
from skimage.io import imread

base_test_dir = 'data/test/'
test_files = glob(os.path.join(base_test_dir, '*.tif'))

# Number of test images loaded into memory per prediction round.
file_batch = 10000
max_idx = len(test_files)

# Per-batch result frames are collected here and concatenated once at the
# end, rather than re-copying a growing DataFrame on every iteration.
batch_frames = []
for idx in range(0, max_idx, file_batch):
    print("Indexes: %i - %i"%(idx, idx+file_batch))
    batch_paths = test_files[idx:idx + file_batch]
    test_df = pd.DataFrame({'path': batch_paths})
    # The id is the file name without directory or extension. os.path copes
    # with the platform's separators, whereas splitting on '/' and '\\' by
    # hand only worked for mixed-separator (Windows) paths.
    test_df['id'] = test_df['path'].map(
        lambda p: os.path.splitext(os.path.basename(p))[0]
    )
    # Stack the images straight into one array instead of keeping them in an
    # object column of the DataFrame as well.
    K_test = np.stack([imread(p) for p in batch_paths])
    predictions = model.predict(K_test)
    # The model has a single output unit; flatten to one score per image.
    test_df['label'] = predictions.ravel()
    batch_frames.append(test_df[["id", "label"]])

# With no test files there is nothing to concatenate; fall back to an empty
# frame that still has the expected columns.
if batch_frames:
    submission = pd.concat(batch_frames)
else:
    submission = pd.DataFrame(columns=["id", "label"])

Indexes: 0 - 10000
Indexes: 10000 - 20000
Indexes: 20000 - 30000
Indexes: 30000 - 40000
Indexes: 40000 - 50000
Indexes: 50000 - 60000


In [12]:
# Write the id/label predictions to disk, leaving out the DataFrame index.
submission.to_csv('submission.csv', index=False)