In [1]:
import keras
keras.__version__

Using TensorFlow backend.


'2.1.2'

# Making predictions on test data

In [4]:
from keras.models import load_model

# Load the trained classifier from disk. load_model() restores the complete saved
# model (architecture and weights) from the HDF5 file, not just the weights.
model = load_model('./weights/adam_no_aug_1.h5')
# Print the layer-by-layer summary of the loaded model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 128)               6553728   
_________________________________________________________________
dropout_8 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_16 (Dense)             (None, 4250)              548250    
Total params: 7,101,978
Trainable params: 7,101,978
Non-trainable params: 0
_________________________________________________________________


**Loading the test images**

In [5]:
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

# The test images are expected in a 'test/' folder inside the current working directory.
home_dir = os.getcwd()
test_dir = os.path.join(home_dir, 'test/')
# Every entry of the directory is treated as one test image; os.listdir() returns
# the names in arbitrary order, and that order is reused for the prediction rows.
test_list = os.listdir(test_dir)
test_count = len(test_list)
print(test_count)


15610


In [26]:
from keras.preprocessing import image

# Size every image is resized to when it is loaded; also used as the spatial input
# size of the convolutional base. Per the original note, the model was trained
# using images of this size.
image_size = (224,224)

def load_image(img_path):
    """Load the image at img_path, resized to image_size, as an array divided by 255."""
    pixels = image.img_to_array(
        image.load_img(img_path, target_size=image_size)
    )
    # In-place scaling keeps the array dtype produced by img_to_array.
    pixels /= 255
    return pixels

In [7]:
import os
import numpy as np

home_dir = os.getcwd()
fname = os.path.join(home_dir, 'targets.csv') # targets for both train and validation

# Read the whole file; the context manager closes it even if reading fails.
with open(fname) as f:
    data = f.read()

# Drop blank lines (including the one after a trailing newline) instead of
# blindly discarding the last element, which would lose a real row whenever the
# file does not end with a newline.
lines = [row for row in data.splitlines() if row.strip()]
header = lines[0].split(',')
lines = lines[1:]

# The second column of every row is the whale id.
ids = [line.split(',')[1] for line in lines]
# Unique ids in sorted order. 'new_whale' is left out of the class list; the set
# difference does not fail when that label is absent from the file.
whale_ids = sorted(set(ids) - {'new_whale'})
ids_count = len(whale_ids)

print(header)
print(len(lines))
print(ids_count)

['Image', 'Id']
9850
4250


# Baseline
Extracting baseline information: get the top 5 most frequently occurring ids in the training data.

In [8]:
from collections import Counter

# Baseline: relative frequency of the (up to) five most common ids in the targets.
top_5 = Counter(ids).most_common(5)
# Iterate over what most_common() actually returned, so fewer than five distinct
# ids does not raise IndexError; float() keeps the ratio a true division on
# Python 2 as well as Python 3.
top_5_base = [(whale_id, count / float(len(lines))) for whale_id, count in top_5]

# Share of rows labelled 'new_whale', looked up by name rather than assuming it
# is the most common id.
p_new_whale = ids.count('new_whale') / float(len(lines))

## Predicting labels

In [27]:
from keras.applications import Xception

# Xception convolutional base with ImageNet weights and no classification head;
# it takes RGB images of size image_size.
conv_base = Xception(
    include_top=False,
    weights='imagenet',
    input_shape=tuple(image_size) + (3,),
)

In [28]:
# Show conv_base's layer-by-layer summary (output shapes and parameter counts).
conv_base.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            (None, 224, 224, 3)  0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 111, 111, 32) 864         input_2[0][0]                    
__________________________________________________________________________________________________
block1_conv1_bn (BatchNormaliza (None, 111, 111, 32) 128         block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_conv1_act (Activation)   (None, 111, 111, 32) 0           block1_conv1_bn[0][0]            
__________________________________________________________________________________________________
block1_con

In [11]:
# Side length of the square feature map produced by the last layer of conv_base.
# NOTE(review): hard-coded. The model summary recorded earlier in this notebook
# shows a first Dense layer with 6,553,728 params = 128 * (51200 + 1), i.e.
# 5 * 5 * 2048 inputs rather than 7 * 7 * 2048 -- confirm that this value (and
# the image size) match the model file that is actually loaded.
ll_size = 7

In [12]:
batch_size = 10

def predict_labels(augmentation):
    """Predict class scores for every image in test_list.

    augmentation: True if the model was trained in train_aug.ipynb, in which case
        it is fed the raw image batch; otherwise the batch is first passed
        through conv_base and the flattened features are fed to the model.

    Returns an array of shape (test_count, ids_count) whose row k holds the
    scores for test_list[k].
    """
    labels = np.zeros(shape=(test_count, ids_count))
    # Step through the test set in batch_size chunks. The batch boundaries are
    # derived from test_count (instead of a hard-coded batch count), and the last
    # batch may be shorter when test_count is not a multiple of batch_size.
    for start in range(0, test_count, batch_size):
        stop = min(start + batch_size, test_count)
        batch = np.zeros(shape=(stop - start, image_size[0], image_size[1], 3))
        for offset, file_name in enumerate(test_list[start:stop]):
            batch[offset] = load_image(os.path.join(test_dir, file_name))
        if augmentation:
            labels_batch = model.predict(batch)
        else:
            features_batch = conv_base.predict(batch)
            # Flatten each feature map to one vector per image.
            features_batch = np.reshape(
                features_batch, (stop - start, ll_size * ll_size * 2048))
            labels_batch = model.predict(features_batch)
        labels[start:stop] = labels_batch
    return labels
              

In [13]:
# Score the whole test set. False selects the branch of predict_labels that runs
# conv_base on each batch before calling the classifier.
label_prediction = predict_labels(False)

In [16]:
# This function takes in a label vector, and returns the top 5 whale ids in order
import copy
def get_ids(label_vec):
    """Return the five highest-scoring whale ids as one space-separated string.

    label_vec: 1-D sequence of per-class scores, indexed like whale_ids.

    Returns the ids ordered from highest to lowest score. Tied scores keep their
    whale_ids order and each tied id is reported (none is skipped). If label_vec
    has fewer than five entries, fewer ids are returned. The input is not modified.
    """
    scores = np.asarray(label_vec)
    # A stable sort of the negated scores gives descending order with ties
    # resolved by the lower index; 'mergesort' is numpy's stable sort kind.
    top = np.argsort(-scores, kind='mergesort')[:5]
    return ' '.join(whale_ids[idx] for idx in top)

In [23]:
# This function takes in a label vector, and returns the top 5 whale ids in order, or inserts
# new_whale if the probability of a guessed ID is less than p_new
p_new = .6
def get_ids_or_new_whale(label_vec):
    """Return five space-separated guesses, inserting 'new_whale' where confidence drops.

    label_vec: 1-D sequence of per-class scores, indexed like whale_ids.

    Ids are listed from highest to lowest score. 'new_whale' is inserted once,
    immediately before the first id whose score is below p_new, and the result is
    cut to five labels. If all of the five best scores are at least p_new,
    'new_whale' does not appear. Tied scores keep their whale_ids order. The
    input is not modified.
    """
    scores = np.asarray(label_vec)
    # A stable sort of the negated scores gives descending order with ties
    # resolved by the lower index; 'mergesort' is numpy's stable sort kind.
    top = np.argsort(-scores, kind='mergesort')[:5]
    guesses = []
    new_whale_guessed = False
    for idx in top:
        if not new_whale_guessed and scores[idx] < p_new:
            guesses.append('new_whale')
            new_whale_guessed = True
        guesses.append(whale_ids[idx])
    # At most six labels were collected (five ids plus 'new_whale'); keep five.
    return ' '.join(guesses[:5])

In [18]:
def top_5_prediction(label_vec):
    """Return the five best (whale_id, score) pairs, highest score first.

    label_vec: 1-D sequence of per-class scores, indexed like whale_ids.

    Tied scores keep their whale_ids order and each tied id is reported. Fewer
    than five pairs are returned if label_vec has fewer than five entries. The
    input is not modified.
    """
    scores = np.asarray(label_vec)
    # A stable sort of the negated scores gives descending order with ties
    # resolved by the lower index; 'mergesort' is numpy's stable sort kind.
    top = np.argsort(-scores, kind='mergesort')[:5]
    return [(whale_ids[idx], scores[idx]) for idx in top]
    

In [20]:
# Spot-check the first ten test images: print the highest class score, then the
# five-label guess produced for that image.
# NOTE(review): the recorded output lists new_whale first for maxima such as
# 0.647 and 0.677, which p_new = .6 would not produce -- the output appears to
# come from a run with a different threshold; re-run this cell to refresh it.
for i in range(10):
    print(max(label_prediction[i]))
    print('  ' + get_ids_or_new_whale(label_prediction[i]))

0.226226791739
  new_whale w_88e679c w_eb0a6ed w_654a5bb w_8bdc211
0.998743355274
  w_8209e5c new_whale w_540fd73 w_1287fbc w_18a854b
0.447560459375
  new_whale w_c13a4e3 w_e6541e4 w_97f5054 w_159f36b
0.983527123928
  w_7c7a78c new_whale w_cd88a48 w_9ca943b w_fed0031
0.647163748741
  new_whale w_5f50b5e w_3674103 w_0fea5a3 w_5297ab3
0.350268483162
  new_whale w_d9fdd15 w_dfbfe10 w_c9e1cdc w_014250a
0.677623450756
  new_whale w_73d5489 w_5cb5fc3 w_376a413 w_ebf3f26
0.232395797968
  new_whale w_697c72e w_fb270f3 w_6ee7470 w_4ce0510
0.317415118217
  new_whale w_fe6617a w_e282300 w_f53deac w_33e7def
0.665615320206
  new_whale w_8d83172 w_32a920b w_987a36f w_87c4190


In [24]:
# Build the submission CSV text: the header row, then one
# "<image file>,<five labels>" row per test image.
# - zip() pairs each file name with its score row, so the row count follows the
#   data instead of a hard-coded 15610.
# - The rows are joined once; repeated += on a growing string is quadratic.
# - No space follows the comma, so the Id field does not start with a blank
#   (consistent with the 'Image,Id' header).
prediction = 'Image,Id\n' + ''.join(
    file_name + ',' + get_ids_or_new_whale(scores) + '\n'
    for file_name, scores in zip(test_list, label_prediction)
)


In [25]:
# Write the submission file. The output folder is created if it is missing, and
# the context manager closes the file even if the write fails.
if not os.path.isdir('predictions'):
    os.makedirs('predictions')
with open('predictions/prediction_adam_p_new=.6.csv','w') as f:
    f.write(prediction)