In [2]:
import keras
# Show the installed Keras version as the cell output.
keras.__version__

'2.2.0'

# Making predictions on test data

In [3]:
from keras.models import load_model

# Restore the saved model from its HDF5 file and print its layer summary.
model = load_model('./weights/data_aug_no_fine_tune_1.h5')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
xception (Model)             (None, 7, 7, 2048)        20861480  
_________________________________________________________________
flatten_3 (Flatten)          (None, 100352)            0         
_________________________________________________________________
dense_5 (Dense)              (None, 256)               25690368  
_________________________________________________________________
dropout_3 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_6 (Dense)              (None, 4250)              1092250   
Total params: 47,644,098
Trainable params: 47,589,570
Non-trainable params: 54,528
_________________________________________________________________




**Loading the test images**

In [4]:
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

home_dir = os.getcwd()
test_dir = os.path.join(home_dir, 'test/')
# os.listdir() returns entries in arbitrary order; sorting keeps the row order
# of the predictions identical from one run (and one machine) to the next.
test_list = sorted(os.listdir(test_dir))
test_count = len(test_list)
print(test_count)


15610


In [5]:
from keras.preprocessing import image

# Input resolution (height, width) the model was trained with
image_size = (224,224)

def load_image(img_path):
    """Read one image file and return it as a pixel array scaled by 1/255.

    The image is resized to ``image_size`` while loading, converted to an
    array, and every value is divided by 255 in place.
    """
    pixels = image.img_to_array(
        image.load_img(img_path, target_size=image_size)
    )
    pixels /= 255
    return pixels

In [6]:
import os
import numpy as np

home_dir = os.getcwd()
fname = os.path.join(home_dir, 'targets.csv') # targets for both train and validation

# `with` closes the file handle even if reading raises.
with open(fname) as f:
    data = f.read()

# Skip blank lines instead of blindly dropping the last element, so the final
# record survives when the file does not end with a newline.
rows = [row for row in data.split('\n') if row.strip()]
header = rows[0].split(',')
lines = rows[1:]  # one "<image>,<id>" record per labelled image

# Second column of every record is the whale id.
ids = [line.split(',')[1] for line in lines]

# Unique ids without the 'new_whale' label, sorted so that the position of
# each id in whale_ids is deterministic (the ranking helpers index this list
# by position in the model's output vector).
whale_ids = sorted(set(ids) - {'new_whale'})
ids_count = len(whale_ids)

print(header)
print(len(lines))
print(ids_count)

['Image', 'Id']
9850
4250


# Baseline
Extract baseline information: get the top 5 most frequently occurring ids in the training data.

In [8]:
from collections import Counter

id_counts = Counter(ids)
# float() forces true division on Python 2 as well; with two ints, `/` would
# floor every relative frequency to 0 there.
total_records = float(len(lines))

# (id, relative frequency) for the most common ids; most_common() may return
# fewer than five pairs, so iterate over what it yields instead of range(5).
top_5 = id_counts.most_common(5)
top_5_base = [(whale_id, count / total_records) for whale_id, count in top_5]

# Share of records labelled 'new_whale', looked up by name rather than by
# assuming it is the most frequent id.
p_new_whale = id_counts['new_whale'] / total_records

# Last expression of the cell, so the baseline is shown as the cell output.
top_5_base

## Predicting labels

In [31]:
from keras.applications import Xception

# Xception network without its classification head (include_top=False),
# initialised with ImageNet weights and built for 3-channel inputs whose
# height and width come from image_size.
conv_base = Xception(weights='imagenet',
                  include_top=False,
                  input_shape=(image_size[0], image_size[1], 3))

In [32]:
# Print the layer-by-layer summary of the convolutional base.
conv_base.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 111, 111, 32) 864         input_3[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 111, 111, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 111, 111, 32) 0           block1_conv1_bn[0][0]            
__________________________________________________________________________________________________
block1_con

In [33]:
# Side length of the square feature map produced by the last layer of
# conv_base; predict_labels flattens those features to ll_size*ll_size*2048.
ll_size = 7

In [7]:
batch_size = 10

def predict_labels(augmentation):
    """Predict a score vector for every image listed in ``test_list``.

    Parameters
    ----------
    augmentation : bool
        True if the model was trained in train_aug.ipynb: the images are fed
        straight to ``model``. Otherwise the images first go through
        ``conv_base`` and the flattened features are passed to ``model``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(test_count, ids_count)``; row ``k`` holds the model
        output for ``test_list[k]``.
    """
    labels = np.zeros(shape=(test_count,ids_count))
    # Step through the test set in batch_size strides. The number of batches
    # follows from test_count, and the final batch shrinks to whatever is
    # left, so any test-set size is handled.
    for start in range(0, test_count, batch_size):
        stop = min(start + batch_size, test_count)
        current_size = stop - start
        batch = np.zeros(shape=(current_size,image_size[0],image_size[1],3))
        for offset in range(current_size):
            batch[offset] = load_image(os.path.join(test_dir,test_list[start + offset]))
        if augmentation:
            labels_batch = model.predict(batch)
        else:
            # Extract convolutional features, then flatten each sample to the
            # 1-D vector the classifier expects.
            features_batch = conv_base.predict(batch)
            features_batch = np.reshape(features_batch,(current_size,ll_size*ll_size*2048))
            labels_batch = model.predict(features_batch)
        labels[start:stop] = labels_batch
    return labels
              

In [8]:
# Score every test image; True selects the branch of predict_labels that
# passes the images directly to `model`.
label_prediction = predict_labels(True)

In [36]:
# This function takes in a label vector, and returns the top 5 whale ids in order

def get_ids(label_vec):
    """Return the five highest-scoring whale ids as one space-separated string.

    Parameters
    ----------
    label_vec : sequence of float
        One score per entry of the module-level ``whale_ids`` list, in the
        same order. It is not modified.

    Returns
    -------
    str
        Ids ordered from highest to lowest score, separated by single spaces.
        Fewer than five ids are returned if ``label_vec`` has fewer than five
        entries.
    """
    scores = np.asarray(label_vec, dtype=float)
    # Stable sort on the negated scores gives descending order with tied
    # scores kept in index order, so every tied id remains its own candidate
    # rather than being discarded together with the first one.
    ranked = np.argsort(-scores, kind='mergesort')[:5]
    return ' '.join(whale_ids[index] for index in ranked)

In [9]:
# This function takes in a label vector, and returns the top 5 whale ids in order, or inserts
# new_whale if the probability of a guessed ID is less than p_new
import copy

p_new = .95
def get_ids_or_new_whale(label_vec):
    """Return five space-separated guesses, inserting 'new_whale' once.

    Candidates are taken from ``whale_ids`` in descending score order. The
    first time the next candidate's score is below ``p_new``, 'new_whale' is
    emitted in its place and the candidate moves to the following slot.

    Parameters
    ----------
    label_vec : sequence of float
        One score per entry of the module-level ``whale_ids`` list, in the
        same order. It is not modified.

    Returns
    -------
    str
        Up to five guesses separated by single spaces.
    """
    scores = np.asarray(label_vec, dtype=float)
    # Stable descending order: tied scores stay in index order and each tied
    # id remains its own candidate.
    ranked = np.argsort(-scores, kind='mergesort')

    guesses = []
    new_whale_guessed = False
    position = 0  # index into `ranked` of the next unused candidate
    while len(guesses) < 5:
        exhausted = position >= len(ranked)
        if not new_whale_guessed and (exhausted or scores[ranked[position]] < p_new):
            guesses.append('new_whale')
            new_whale_guessed = True
        elif exhausted:
            # No candidates left and 'new_whale' already used.
            break
        else:
            guesses.append(whale_ids[ranked[position]])
            position += 1
    return ' '.join(guesses)

In [42]:
def top_5_prediction(label_vec):
    """Return the five highest-scoring ``(whale_id, score)`` pairs.

    Parameters
    ----------
    label_vec : sequence of float
        One score per entry of the module-level ``whale_ids`` list, in the
        same order. It is not modified.

    Returns
    -------
    list of tuple
        Pairs ordered from highest to lowest score; fewer than five if
        ``label_vec`` has fewer than five entries.
    """
    vec = np.asarray(label_vec)
    # Stable sort on the negated scores: descending order, tied scores kept in
    # index order so each tied id is reported instead of being dropped.
    ranked = np.argsort(-vec.astype(float), kind='mergesort')[:5]
    return [(whale_ids[index], vec[index]) for index in ranked]
    

In [43]:
# Preview the first 30 test images: highest score, then the guesses string.
for row_index in range(30):
    row_scores = label_prediction[row_index]
    print(max(row_scores))
    print('  ' + get_ids_or_new_whale(row_scores))

0.823091685772
  new_whale w_b7d5069 w_0d48a7d w_f0f0dbb w_7e8b270
0.999961495399
  w_8209e5c new_whale w_cc504e7 w_71bfc77 w_b0aed4a
0.844835221767
  new_whale w_7ed3719 w_e6541e4 w_2d2de1d w_4643de5
0.998827040195
  w_7c7a78c new_whale w_43be268 w_ee17a08 w_e61dd6d
0.651666581631
  new_whale w_a25caa9 w_eb39613 w_ca8bfb4 w_5317c46
0.954369366169
  w_2d99a0c new_whale w_0d48a7d w_0b3f313 w_dfbfe10
0.350891113281
  new_whale w_0cd401c w_1272a31 w_376a413 w_eb39613
0.836710512638
  new_whale w_3d3c0f9 w_e282300 w_ed117b3 w_4c25641
0.262933343649
  new_whale w_3b483d3 w_ed8a846 w_af890ad w_67de30b
0.628050029278
  new_whale w_96c141f w_89e159a w_17ee910 w_4432150
0.966293096542
  w_fe5e78b new_whale w_648a9a8 w_fce6ab2 w_3ce788a
0.476489156485
  new_whale w_fd1cb9d w_78f2e92 w_fe49bc4 w_96fd936
0.554431319237
  new_whale w_b7d5069 w_44a6b62 w_aef3680 w_5cef366
0.99999153614
  w_43b50e5 new_whale w_44cccf6 w_f8e6546 w_89d9c03
0.999972343445
  w_e430ce0 new_whale w_9989964 w_67fecca w_71e6

In [10]:
# Build the submission text: a header line followed by one
# "<file name>,<guesses>" row per test image. The separator is a bare comma,
# matching the header, so the Id field does not start with a space.
# Iterating over the data itself (instead of a fixed count) keeps this cell
# correct for any test-set size, and joining once avoids repeated string
# concatenation.
rows = ['Image,Id']
for file_name, label_vec in zip(test_list, label_prediction):
    rows.append(file_name + ',' + get_ids_or_new_whale(label_vec))
prediction = '\n'.join(rows) + '\n'


In [11]:
output_path = 'predictions/prediction_data_aug_no_fine_tune_1.csv'

# Create the output folder on first use so the write does not fail after the
# predictions have already been computed.
output_dir = os.path.dirname(output_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)

# `with` closes the file even if the write raises.
with open(output_path, 'w') as f:
    f.write(prediction)