# Carvana Unet Pseudo Labeling

## Imports

In [1]:
from keras.layers.advanced_activations import PReLU
from keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint, TensorBoard
import keras.backend as K
from keras.preprocessing import image
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from optimizers.AdamAccumulate import AdamAccumulate
#from models.u_net import UNet
#from models.u_net_aux import UNet_Aux
from models.u_net_heng import UNet_Heng
from utilities.submit import generate_submit
from utilities import utils_masks as utils
from utilities.losses import weighted_bce_dice_loss, dice_value

%load_ext autoreload
%autoreload 2
%matplotlib inline

Using TensorFlow backend.


## Preparing Data

In [3]:
# Project helper; presumably seeds the random number generators so the
# splits and samples drawn below are repeatable (confirm in utils_masks).
utils.set_results_reproducible()


def _image_id(file_name):
    """Return the part of ``file_name`` before its first '.'."""
    return file_name.split('.')[0]


# Side length used for the network input and handed to the data generators.
input_size = 128
# Number of test images drawn per group for pseudo labelling.
num_pseud_data = 10000

# Path templates; '{}' is filled in with an image id.
train_path = "inputs/train/{}.jpg"
train_mask_path = "inputs/train_masks/{}_mask.gif"
test_path = "inputs/test/{}.jpg"
test_mask_path = "inputs/test_masks/{}_mask.gif"

# Bounding boxes read through the project helper (set ``bboxes = None`` to
# run without them, as the original commented-out line suggested).
bbox_file_path = 'inputs/data_bbox.csv'
bboxes = utils.get_bboxes(bbox_file_path)

# Labelled training images, split 80/20 into train and validation ids.
train_df = pd.read_csv('inputs/train_masks.csv')
all_ids_train = train_df['img'].map(_image_id)
all_ids_train_split, all_ids_valid_split = train_test_split(
    all_ids_train,
    test_size=0.2,
    random_state=42)

# The sample submission lists the test image file names.
test_df = pd.read_csv('inputs/sample_submission.csv')

# Further groups left disabled in the original:
# ['08', '09', '10'], '04|05|06', '12|13|14', '15|03', '07|11'
groups = [['16', '01', '02']]

# Per-group id collections, keyed by the '|'-joined group name.
ids_train_splits = dict()
ids_valid_splits = dict()
ids_test_splits = dict()
test_df_group = pd.DataFrame([])


# For every group: select the train/validation ids whose file name ends in one
# of the group's suffixes, and draw the test images to be pseudo-labelled.
for group in groups:
    # Join however many members the group has; the original indexed exactly
    # three and would fail on two-member groups.
    group_name = '|'.join(group)
    df_group = train_df[(train_df.img.str.match('^.*_(' + group_name + ').jpg$'))]
    ids_group = df_group['img'].map(lambda s: s.split('.')[0])
    ids_train_split = pd.Series(list(set(all_ids_train_split).intersection(set(ids_group))))
    ids_valid_split = pd.Series(list(set(all_ids_valid_split).intersection(set(ids_group))))
    ids_train_splits[group_name] = ids_train_split
    ids_valid_splits[group_name] = ids_valid_split

    # Equal share of test images per group member. Floor division keeps ``n``
    # an integer on Python 3 as well (DataFrame.sample rejects float counts).
    samples_per_member = num_pseud_data // len(group)
    # Build the frame from scratch for each group so that test images drawn
    # for an earlier group do not leak into the next one.
    test_df_group = pd.concat([
        test_df[(test_df.img.str.match('^.*_(' + member + ').jpg$'))].sample(n=samples_per_member)
        for member in group
    ])

    # Resample with replacement up to exactly ``num_pseud_data`` rows.
    test_df_group = test_df_group.sample(n=num_pseud_data, replace=True)
    ids_test_splits[group_name] = test_df_group['img'].map(lambda s: s.split('.')[0])
    print('group {0}:   #Training = {1}   #Validation = {2}   #Test = {3}'.format(group_name, 
                                                                    len(ids_train_split), 
                                                                    len(ids_valid_split),
                                                                    len(ids_test_splits[group_name])))

def train_generator(batch_size, group, outputs=None):
    """Return the training batch generator for one group.

    Thin wrapper around ``utils.train_generator`` that supplies the
    module-level paths, input size and bounding boxes, and enables the
    'HUE_SATURATION' and 'SHIFT_SCALE' augmentations.

    Args:
        batch_size: Batch size forwarded to the underlying generator.
        group: Key into ``ids_train_splits`` selecting the training ids.
        outputs: Forwarded unchanged as the ``outputs`` keyword.
    """
    group_ids = ids_train_splits[group]
    augmentations = ['HUE_SATURATION', 'SHIFT_SCALE']
    return utils.train_generator(
        train_path, train_mask_path, group_ids,
        input_size, batch_size, bboxes,
        outputs=outputs, augmentations=augmentations)

def valid_generator(batch_size, group, outputs=None):
    """Return the validation batch generator for one group.

    Thin wrapper around ``utils.valid_generator`` that supplies the
    module-level paths, input size and bounding boxes.

    Args:
        batch_size: Batch size forwarded to the underlying generator.
        group: Key into ``ids_valid_splits`` selecting the validation ids.
        outputs: Forwarded unchanged as the ``outputs`` keyword.
    """
    group_ids = ids_valid_splits[group]
    return utils.valid_generator(
        train_path, train_mask_path, group_ids,
        input_size, batch_size, bboxes,
        outputs=outputs)


def pseudo_generator(batch_size, group, accum_iters, outputs=None):
    """Return the batch generator that mixes train and pseudo-labelled test ids.

    The id list is built by ``utils.make_list_ids`` from the group's training
    ids and test ids, then handed to ``utils.pseudo_generator`` together with
    the module-level train/test path templates, input size and bounding boxes.

    Args:
        batch_size: Batch size forwarded to both project helpers.
        group: Key into ``ids_train_splits`` and ``ids_test_splits``.
        accum_iters: Forwarded to ``utils.make_list_ids`` (presumably the
            gradient-accumulation step count; confirm in utils_masks).
        outputs: Forwarded unchanged as the ``outputs`` keyword.
    """
    # Both lookups use the ``group`` argument. The original read the
    # module-level ``group_name`` for the test ids, so the result depended on
    # whichever loop last ran in the notebook rather than on the caller.
    ids = utils.make_list_ids(ids_train_splits[group], ids_test_splits[group], batch_size, accum_iters)
    return utils.pseudo_generator(train_path, train_mask_path, test_path, test_mask_path, ids,
                                 input_size, batch_size, bboxes, outputs=outputs)

group 16|01|02:   #Training = 758   #Validation = 196   #Test = 10000


## Create Model

In [3]:
# Pull a single batch (batch size 16, group '16|01|02', accum_iters 1) from
# the pseudo-label generator so it can be inspected in the cells below.
sample_batches = pseudo_generator(16, '16|01|02', 1)
x = next(sample_batches)

In [None]:
# Shapes of the two arrays in the sampled batch. Written as function calls so
# the cell runs on both Python 2 and 3, like the other print calls in this file.
print(np.shape(x[0]))
print(np.shape(x[1]))

In [None]:
# Show element ``idx`` of the sampled batch: x[1] in the left panel and x[0]
# in the right panel.
idx = 8
plt.figure(figsize=(10, 10))
for panel_no, panel in enumerate((x[1][idx], x[0][idx]), 1):
    plt.subplot(1, 2, panel_no)
    plt.imshow(np.squeeze(panel))

In [4]:
#U-Net-Aux:
#model = UNet_Aux((input_size, input_size, 3), filters=64, depth=4, dropout_base_only=False, dropout=0,
#                 activation=lambda x: PReLU()(x), init='he_uniform', auxiliaries=[False, True, True, False])
#outputs = {'aux_out1':2**-1, 'aux_out2':2**-2, 'main_out':1}
#weights = {'aux_out1':0.2, 'aux_out2':0.05, 'main_out':1.}
#model.compile(optimizer=AdamAccumulate(accum_iters=4), 
#              loss=weighted_bce_dice_loss, metrics=[dice_value], loss_weights=weights)

#U-Net:
#model = UNet((input_size, input_size, 3), filters=64, depth=4, dropout_base_only=False, dropout=0,
#             activation=lambda x: PReLU()(x), init='he_uniform')
#model.compile(optimizer=AdamAccumulate(accum_iters=4), loss=weighted_bce_dice_loss, metrics=[dice_value])

# One UNet_Heng per group, stored under the '|'-joined group name.
models = {}
for group in groups:
    # '|'.join works for any group size; the original indexed exactly three
    # members and raised IndexError on shorter groups.
    group_name = '|'.join(group)
    # ``model`` stays bound at module level; later cells use it directly.
    model = UNet_Heng((input_size, input_size, 3))
    models[group_name] = model

## Fit Models

In [5]:
# Training hyper-parameters shared by all groups.
epochs = 150
batch_size = 16
accum_iters = 4

# Fine-tune each group's model on its training ids plus pseudo-labelled test ids.
for group in groups:
    # '|'.join works for any group size; the original indexed exactly three members.
    group_name = '|'.join(group)
    model = models[group_name]
    model.compile(optimizer=AdamAccumulate(accum_iters=accum_iters),
                  loss=weighted_bce_dice_loss, metrics=[dice_value])
    run_name = utils.get_run_name('weights/{}.hdf5', 'unet-heng-{}'.format(group_name))
    weights_path = 'weights/{}.hdf5'.format(run_name)

    # All callbacks watch the validation dice metric, which is maximised.
    callbacks = [EarlyStopping(monitor='val_dice_value',
                               patience=8,
                               verbose=1,
                               min_delta=1e-4,
                               mode='max'),
                 ReduceLROnPlateau(monitor='val_dice_value',
                                   factor=0.1,
                                   patience=4,
                                   verbose=1,
                                   epsilon=1e-4,
                                   mode='max'),
                 ModelCheckpoint(monitor='val_dice_value',
                                 filepath=weights_path,
                                 save_best_only=True,
                                 save_weights_only=True,
                                 mode='max'),
                 TensorBoard(log_dir='logs/{}'.format(run_name), batch_size=batch_size)]

    # Start from previously saved weights rather than a fresh initialisation.
    model.load_weights('weights/unet-heng-2017-09-23-1907.hdf5')
    #K.set_value(model.optimizer.lr, 1e-4)

    num_train_data = len(ids_train_splits[group_name])+len(ids_test_splits[group_name])
    num_valid_data = len(ids_valid_splits[group_name])
    # Keras documents the step counts as integers; np.ceil alone returns a
    # numpy float, so convert explicitly.
    steps_per_epoch = int(np.ceil(float(num_train_data) / float(batch_size)))
    validation_steps = int(np.ceil(float(num_valid_data) / float(batch_size)))

    print('Starting run "{}"'.format(run_name))
    model.fit_generator(generator=pseudo_generator(batch_size, group_name, accum_iters),
                        steps_per_epoch=steps_per_epoch,
                        epochs=epochs,
                        verbose=1,
                        callbacks=callbacks,
                        validation_data=valid_generator(batch_size, group_name),
                        validation_steps=validation_steps)

Starting run "unet-heng-16|01|02-2017-09-25-0753"
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 00016: reducing learning rate to 0.00010000000475.
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 00022: reducing learning rate to 1.0000000475e-05.
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 00026: reducing learning rate to 1.00000006569e-06.
Epoch 00026: early stopping


## Create Pseudo Labels

In [1]:
import os
from shutil import copy
import pandas as pd

#### Copy test samples of specific directions to a new folder.

In [2]:
# Source path template ('{}' is filled with the image file name) and the
# folder the selected images are copied into.
src = 'inputs/test_hq/{}'
des = 'inputs/test_hq_06_13'

# The sample submission lists every test image file name in its 'img' column.
df = pd.read_csv('inputs/sample_submission.csv')

# Further groups left disabled in the original:
# '08|09|10', '04|05|06', '12|13|14', '15|03', '07|11'
groups = ['06|13']
for group in groups:
    # Keep the file names whose suffix matches one of the group's alternatives.
    suffix_pattern = '^.*_(' + group + ').jpg$'
    in_group = df.img.str.match(suffix_pattern)
    df_group = df[in_group]['img']
    # Create the destination folder on first use.
    if not os.path.exists(des):
        os.makedirs(des)
    for im_name in df_group:
        source_file = src.format(im_name)
        copy(source_file, des)

#### Predict the masks of the selected test samples

In [None]:
# Folder templates; '{}' is filled with the '|'-joined group name.
src = 'inputs/test_{}/'
des = 'inputs/test_masks_{}/'

# NOTE(review): this rebinds the module-level ``bboxes`` to the test-set boxes,
# which the train/validation generators and the validation visualisation also
# read. Confirm that is intended before re-running earlier cells.
#bboxes = None
bbox_file_path = 'inputs/test_bbox.csv'
bboxes = utils.get_bboxes(bbox_file_path)

# Further groups left disabled in the original:
# ['08', '09', '10'], '04|05|06', '12|13|14', '15|03', '07|11'
groups = [['16', '01', '02']]
for group in groups:
    # '|'.join works for any group size; the original indexed exactly three members.
    group_name = '|'.join(group)
    model = models[group_name]
    model.load_weights('weights/unet-2017-09-03-1739.hdf5')
    # Format the output folder once and create it if it is missing.
    masks_dir = des.format(group_name)
    if not os.path.exists(masks_dir):
        os.makedirs(masks_dir)
    generate_submit(model, input_size, batch_size=16, threshold=0.5, test_path=src.format(group_name), 
                    submit_path='outputs/',
                    run_name='generate_test_masks_1', test_masks_path=masks_dir, bboxes=bboxes)

## Validation

In [None]:
def np_dice_value(y_true, y_pred, smooth=1.):
    """Return the smoothed Dice coefficient of two masks.

    The value is
    ``(2 * sum(y_true * y_pred) + smooth) / (sum(y_true) + sum(y_pred) + smooth)``
    taken over all elements of the flattened inputs.

    Args:
        y_true: Ground-truth mask; any array-like (ndarray, nested list, ...).
        y_pred: Predicted mask with the same number of elements as ``y_true``.
        smooth: Term added to numerator and denominator. It keeps the ratio
            defined when both masks sum to zero. Defaults to 1, the value the
            original implementation hard-coded.

    Returns:
        The Dice value as a float scalar.
    """
    # Convert to float64 so plain lists are accepted and integer masks cannot
    # wrap around when multiplied element-wise.
    y_true_f = np.asarray(y_true, dtype=np.float64).ravel()
    y_pred_f = np.asarray(y_pred, dtype=np.float64).ravel()
    intersection = np.sum(y_true_f * y_pred_f)
    return (2. * intersection + smooth) / (np.sum(y_true_f) + np.sum(y_pred_f) + smooth)

### Prediction

In [None]:
# Load the weights to evaluate.
# NOTE(review): ``model`` is whatever the last model-creating cell left bound;
# confirm that this weights file matches that architecture.
run_name = 'unet-2017-09-03-1739'
model.load_weights('weights/{}.hdf5'.format(run_name))

# valid_generator takes the group key as its second argument; the original call
# omitted it and raised a TypeError. ``group_name`` and ``ids_valid_split`` are
# the values left bound by the group loops in the earlier cells. The batch size
# is set to the size of the validation split so that one ``next`` call is
# expected to return the whole split.
val_imgs, val_masks = next(valid_generator(len(ids_valid_split), group_name))
val_imgs = np.array(val_imgs)
val_masks = np.array(val_masks)
val_pred_masks = model.predict(val_imgs, batch_size=1)
# One dice value per validation example, in generator order.
masks_val_dices = [np_dice_value(mask, pred_mask) for (mask, pred_mask) in zip(val_masks, val_pred_masks)]

### Display one of the worst predicted masks for validation examples

In [None]:
# Rank the validation examples by ascending dice and pick rank 7 (rank 0 has
# the lowest dice); change the rank to browse other poorly predicted examples.
index = np.argsort(masks_val_dices)[7]
# ``img_id`` instead of ``id`` so the builtin is not shadowed.
img_id = ids_valid_split.values[index]
utils.show_mask(train_path.format(img_id), val_masks[index].squeeze(), val_pred_masks[index].squeeze(), show_img=False)
# Function-call form prints the same "<id> <dice>" line on Python 2 and 3.
print('{} {}'.format(img_id, masks_val_dices[index]))

In [None]:
# Print, in ascending order, the dice values at or below the threshold.
# ``masks_val_dices`` is a plain list, so it must become an array before a
# boolean mask can be applied. The original compared the list itself with a
# float and then indexed the list with the resulting bool.
# NOTE(review): the original threshold was 99.6. With masks in [0, 1] the
# values from np_dice_value cannot exceed 1, so it was presumably meant as a
# percentage; confirm that 0.996 is the intended cut-off.
dice_values = np.asarray(masks_val_dices)
low_dices = np.sort(dice_values[dice_values <= 0.996])
for dice in low_dices:
    print(dice)

### Histogram

In [None]:
# Histogram of the validation dice values over 50 equal-width bins, drawn as
# bars centred on each bin and covering 70% of the bin width.
counts, edges = np.histogram(masks_val_dices, bins=50)
bar_width = 0.7 * (edges[1] - edges[0])
bin_centers = 0.5 * (edges[:-1] + edges[1:])
plt.bar(bin_centers, counts, align='center', width=bar_width)
plt.show()

### Visualization

In [None]:
# Show three randomly chosen validation examples (indices may repeat) with
# their true and predicted masks.
indices = np.random.randint(len(ids_valid_split), size=3)
for index in indices:
    # ``img_id`` instead of ``id`` so the builtin is not shadowed.
    img_id = ids_valid_split.values[index]
    # NOTE(review): ``bboxes`` must still hold the training boxes here; the
    # mask-prediction cell rebinds it to the test boxes. Confirm cell order.
    utils.show_mask(train_path.format(img_id), val_masks[index].squeeze(), val_pred_masks[index].squeeze(),
                    show_img=True, bbox=bboxes[img_id])

## Test

### Load Model

In [None]:
# ``model`` must already exist; run a model-creating cell first if needed.
# Weights are read from 'weights/<run_name>.hdf5'.
run_name = 'unet-2017-08-20-5'
weights_file = 'weights/{}.hdf5'.format(run_name)
model.load_weights(weights_file)

### Generate Submit

In [None]:
# Settings for the submission run.
batch_size = 16
threshold = 0.5
# Alternative values noted in the original: 'inputs/test/' for test_path and
# None for test_masks_path.
# NOTE: this rebinds the module-level ``test_path`` that pseudo_generator reads.
test_path = 'inputs/test1/'
test_masks_path = 'outputs/test1_masks/'
generate_submit(
    model,
    input_size,
    batch_size,
    threshold,
    test_path,
    'outputs/',
    run_name,
    test_masks_path)

### Visualization

In [None]:
# Display the test images from ``test_path`` with the masks stored under
# ``test_masks_path`` (exact layout is decided by utils.show_test_masks).
utils.show_test_masks(test_path, test_masks_path)