In [40]:
import keras
keras.__version__

'2.1.2'

# Making predictions on test data

In [51]:
from keras.models import load_model

# Load weights of model trained on AWS
model = load_model('./weights/name_that_whale_702.h5')
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 128)               9437312   
_________________________________________________________________
dropout_1 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 4250)              548250    
Total params: 9,985,562
Trainable params: 9,985,562
Non-trainable params: 0
_________________________________________________________________


**Loading the test images**

In [42]:
import os
import numpy as np
from keras.preprocessing.image import ImageDataGenerator

home_dir = os.getcwd()
test_dir = os.path.join(home_dir, 'test/')
test_list = os.listdir(test_dir)
test_count = len(test_list)
print(test_count)


15610


In [43]:
from keras.preprocessing import image

# Model was trained using images of this size
image_size = (180,180)

def load_image(img_path):
    img = image.load_img(img_path, target_size = image_size)
    x = image.img_to_array(img)
    x /= 255
    return x

In [44]:
import os
import numpy as np

home_dir = os.getcwd()
fname = os.path.join(home_dir, 'targets.csv') # targets for both train and validation

f = open(fname)
data = f.read()
f.close()

lines = data.split('\n')
header = lines[0].split(',')
lines = lines[1:]
lines = lines[:-1]

ids = [line.split(',')[1] for line in lines]
whale_ids = set(ids) # convert to set to remove duplicats
whale_ids = list(whale_ids) # convert back to list to make it ordered
whale_ids.remove('new_whale')
whale_ids = sorted(whale_ids)
ids_count = len(whale_ids)

print(header)
print(len(lines))
print(ids_count)

['Image', 'Id']
9850
4250


Extracting baseline info. Get the top 5 most frequently occuring ids in train data

In [66]:
from collections import Counter

top_5 = Counter(ids).most_common(5)
top_5_base = []
for i in range(5):
    t = top_5[i][0], top_5[i][1]/len(lines)
    top_5_base.append(t)

top_5_base
p_new_whale = top_5_base[0][1]

## Predicting labels

In [50]:
from keras.applications import Xception
image_size = (180,180) #adjustable parameter for processed image_size. Run time should 

conv_base = Xception(weights='imagenet',
                  include_top=False,
                  input_shape=(image_size[0], image_size[1], 3))

In [63]:
batch_size = 10

def predict_labels(augmentation):
    # augmentation parameter is true if model was trained in the train_aug.ipynb, otherwise false
    labels = np.zeros(shape=(test_count,ids_count))
    for i in range(1561):
        batch = np.zeros(shape=(batch_size,180,180,3))
        for j in range(batch_size):
            batch[j] = load_image(os.path.join(test_dir,test_list[i*batch_size + j]))
        if augmentation:
            labels_batch = model.predict(batch)
        else:  
            features_batch = conv_base.predict(batch)
            features_batch = np.reshape(features_batch,(batch_size,6*6*2048))
            labels_batch = model.predict(features_batch)
        labels[i * batch_size : (i + 1) * batch_size] = labels_batch
    return labels
            
        

In [64]:
label_prediction = predict_labels(False)

In [65]:
f = open('predictions/label_prediction_0.txt','w')
f.write(str(label_prediction))
f.close()

In [22]:
# This function takes in a label vector, and returns the top 5 whale ids in order
def get_ids(label_vec):
    ids = ''
    vec = label_vec
    for i in range(5):
        max_value = max(vec)
        max_index = np.where(vec==max_value)
        ids += whale_ids[max_index[0][0]]
        vec[max_index] = -1
        if i is not 4:
            ids += ' '
    return ids

In [None]:
# This function takes in a label vector, and returns the top 5 whale ids in order, and guesses 
p_new = p_new_whale
def get_ids_or_new_whale(label_vec):
    ids = ''
    vec = label_vec
    new_whale_guessed = False
    for i in range(5):
        max_value = max(vec)
        if max_value < p_new and new_whale_guessed != True:
            ids += 'new_whale'
            new_whale_guessed = True
        else:
            max_index = np.where(vec==max_value)
            ids += whale_ids[max_index[0][0]]
            vec[max_index] = -1
        if i is not 4:
            ids += ' '
    return ids

In [37]:
def top_5_prediction(label_vec):
    top_5_pred = []
    vec = label_vec
    for i in range(5):
        max_value = max(vec)
        max_index = np.where(vec==max_value)
        max_id = whale_ids[max_index[0][0]]
        p = max_id, max_value
        top_5_pred.append(p)
        vec[max_index] = -1
    return top_5_pred
    

In [62]:
top_5_prediction(label_prediction[21])

[('w_98baff9', 0.043619692325592041),
 ('w_74adf0b', 0.03126254677772522),
 ('w_97f5054', 0.030560841783881187),
 ('w_ddbf533', 0.019387600943446159),
 ('w_cef690d', 0.012810902670025826)]

In [60]:
for i in range(100):
    print(max(label_prediction[i]))

0.790890276432
0.188661500812
0.0436196923256
0.999880671501
0.130795732141
0.35473921895
0.333799034357
0.730777025223
0.374759882689
0.253903359175
0.17209880054
0.225434482098
0.178572177887
0.999999761581
0.0816478431225
0.196595430374
0.786136865616
0.268524706364
0.254081189632
0.770085811615
0.0685156732798
0.0289645101875
0.127820253372
0.040358517319
0.647751927376
0.0687883496284
0.98959094286
0.890912234783
0.513904511929
0.0572819747031
0.497076123953
0.246196120977
0.191147491336
0.0632137134671
0.0425635203719
0.205020770431
0.0805441886187
0.376503765583
0.252388656139
0.934563457966
0.140293240547
0.164337962866
0.143307521939
0.341555595398
0.177260652184
0.0405299626291
0.0528962612152
0.212522774935
0.414223372936
0.0446017980576
0.432242006063
0.161375388503
0.0686145871878
0.26704621315
0.149836808443
0.102045580745
0.477504223585
0.137165606022
0.15158714354
0.159041211009
0.0450534895062
0.820578098297
0.269502192736
0.562408447266
0.0903078615665
0.997298896313


In [25]:
prediction = 'Image,Id\n'
for i in range(15610):
    prediction += test_list[i]
    prediction += ', '
    prediction += get_ids(label_prediction[i])
    prediction += '\n'

#print(prediction)

In [26]:
f = open('predictions/prediction_2.csv','w')
f.write(prediction)
f.close()

In [None]:
baseline = 'new_whale w_1287fbc w_98baff9 w_7554f44 w_1eafe46'