Check GPU

In [1]:
# Ask the Keras TensorFlow backend which GPU devices it can see.
# NOTE(review): `_get_available_gpus` is an underscore-prefixed helper of the
# backend module, so it is not a stable public API; this cell also imports the
# standalone `keras` package while the rest of the notebook uses
# `tensorflow.keras`.
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

Imports

In [2]:
import sys, os, csv
from urllib import request, error
from PIL import Image
from io import BytesIO
import boto3
import random
import pandas as pd
import time

In [3]:
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import numpy as np
import io

import tempfile
import pickle
from random import shuffle

In [4]:
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import applications
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import Callback

from skimage.transform import resize

Constants

In [5]:
# Notebook-wide configuration values.

# Local directory for model files.
model_dir = '../models'

# Colour mode of the input images; it determines the channel count below.
color_mode = 'rgb'

# Base batch size.
batch_size = 32

# Spatial size of the images fed to the network.
height, width = 100, 100

# Channel count: three for RGB input, otherwise a single channel.
if color_mode == 'rgb':
    depth = 3
else:
    depth = 1

# How many of the last pretrained layers stay trainable (0 freezes them all).
n_layers_to_tune = 0

Setup

In [6]:
# Handles for the S3 bucket the notebook downloads its data and weights from.
bucket_name = 'landmark-data-12345'
s3 = boto3.resource('s3')
bucket = s3.Bucket(bucket_name)

In [7]:
# Fetch the sample submission from S3 once, cache it locally, and load it.
# Its `id` column is used later to order the predictions.
source = 'data/sample_submission.csv'
dest = '../data/sample_submission.csv'

if not os.path.isfile(dest):
    # Make sure the local target directory exists before downloading into it.
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    bucket.download_file(source, dest)

sample_sub = pd.read_csv(dest)

In [8]:
# Build the list of S3 keys of all test images (`data/test/*.jpg`).
# Listing the bucket is slow, so the result is cached in a local pickle and
# reused on later runs.
filekeys_cache = 'pickles/test_filekeys'
filekeys = []

if os.path.isfile(filekeys_cache):
    print('loading data from files')

    # NOTE: pickle.load can execute arbitrary code contained in the file; only
    # load a cache that this notebook wrote itself.
    with open(filekeys_cache, 'rb') as file:
        filekeys = pickle.load(file)

else:
    print('data pickles dont exist, generating')
    objects = bucket.objects.filter(Prefix="data/test/")
    filekeys = [o.key for o in objects if o.key.endswith('.jpg')]

    # The cache directory may not exist on a fresh checkout.
    os.makedirs(os.path.dirname(filekeys_cache), exist_ok=True)
    with open(filekeys_cache, 'wb') as file:
        pickle.dump(filekeys, file)

print('data loaded')

loading data from files
data loaded


In [9]:
# Number of test image keys found; used as the loop bound during prediction.
num_filekeys = len(filekeys)
print('there are {} test images'.format(num_filekeys))

there are 113980 test images


Helper functions

In [10]:
def load_s3_file(filekey):
    """Download one image from the S3 bucket and return it as a numpy array.

    Args:
        filekey: S3 object key of the image inside `bucket`.

    Returns:
        The decoded image as returned by `matplotlib.image.imread`, wrapped in
        `np.array`.

    Raises:
        Whatever boto3 raises for a failed download, or the image reader raises
        for an undecodable file; nothing is caught here.
    """
    s3_object = bucket.Object(filekey)

    # The context manager closes and deletes the temporary file even when the
    # download or the decode fails.
    with tempfile.NamedTemporaryFile() as tmp:
        s3_object.download_fileobj(tmp)
        # Push buffered bytes to disk before the file is re-read by name;
        # otherwise the reader may see a truncated image.
        tmp.flush()
        img = mpimg.imread(tmp.name)

    return np.array(img)

In [11]:
def process_img(img):
    """Resize an image to the model's input shape.

    Args:
        img: image array as returned by `load_s3_file`.

    Returns:
        Array of shape `(height, width, depth)`. If resizing fails for any
        reason, an all-zero array of that shape is returned instead, so the
        caller always gets a usable model input.
    """
    try:
        # The original computed `np.divide(img, 255.0)` here and then discarded
        # the result by resizing the raw `img`. The dead statement is removed;
        # the returned values are unchanged. With the default
        # preserve_range=False, skimage's resize already converts integer
        # images to floats following its img_as_float convention.
        return resize(img, (height, width, depth), mode='reflect', anti_aliasing=True)
    except Exception:
        # Best-effort fallback kept from the original, but narrowed from a bare
        # `except:` so that KeyboardInterrupt / SystemExit still stop the
        # long-running prediction loop.
        return np.zeros((height, width, depth))

Model

In [12]:
# create model
# Pretrained VGG19 convolutional base without its classifier head.
# Keras expects input_shape as (height, width, channels); the original passed
# (width, height, ...), which only worked because both are 100. `depth` is the
# channel count already derived from color_mode in the constants cell.
pre_model = applications.VGG19(weights="imagenet",
                               include_top=False,
                               input_shape=(height, width, depth))

# Copy the base layer by layer into a Sequential model, freezing every layer
# except the last `n_layers_to_tune`. The public `layers` property is used
# instead of the private `_layers` attribute.
pre_layers = pre_model.layers
n_frozen = len(pre_layers) - n_layers_to_tune

model = Sequential()
for idx, layer in enumerate(pre_layers):
    if idx < n_frozen:
        layer.trainable = False
    model.add(layer)

# Classifier head: two 512-unit ReLU layers with dropout, then a softmax over
# the 14940 output classes.
model.add(Flatten(input_shape=pre_model.output_shape[1:]))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(14940, activation='softmax'))

In [13]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the
# assembled model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 100, 100, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 100, 100, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 50, 50, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 50, 50, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 50, 50, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 25, 25, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 25, 25, 256)       295168    
__________

Load model

In [14]:
# download weights
# Weight files are numbered; `num_batch_to_load` selects which one to restore.
num_batch_to_load = 4
weights_filename = 'landmark_model_weights' + str(num_batch_to_load) + '.h5'

# S3 keys always use '/', so the key is built by plain concatenation; the
# local path goes through the `model_dir` constant instead of repeating it.
source = 'models/' + weights_filename
dest = os.path.join(model_dir, weights_filename)

if not os.path.isfile(dest):
    # Make sure the local model directory exists before downloading into it.
    os.makedirs(model_dir, exist_ok=True)
    bucket.download_file(source, dest)

model.load_weights(dest)

In [15]:
# load pruned classes to turn pruned prediction into normal prediction
# (`pruned_classes` is indexed by the model's output index during prediction).
# NOTE: pickle.load can execute arbitrary code contained in the file; only load
# a pickle from a trusted source.
with open('pickles/pruned_classes', 'rb') as file:
    pruned_classes = pickle.load(file)

In [None]:
# Predict a landmark for every id in the sample submission, in submission order.
#
# `filekeys` and `sample_sub['id']` are walked in parallel. Whenever the next
# submission id has no matching image, a blank (all-zero) image is predicted
# for it so that `preds` still gets one entry per submission id.
# NOTE(review): this relies on `filekeys` being in the same relative order as
# the ids in `sample_sub` -- confirm.
#
# Fixes over the original loop:
#   * the inner loop no longer spins forever once the images run out while the
#     last batch is only partially filled;
#   * submission ids that come after the last image are now emitted too;
#   * an image id that never shows up in `sample_sub` raises a clear error
#     instead of an IndexError from `.iloc`.
preds = []
idx = 0      # position in filekeys
sub_idx = 0  # position in the sample submission ids

# Hoisted out of the loop: plain list lookups instead of one `.iloc` per image.
sub_ids = sample_sub['id'].tolist()
num_sub_ids = len(sub_ids)
pred_batch_size = batch_size * 4
blank_img = np.zeros((height, width, depth))

start = time.time()
while idx < num_filekeys or sub_idx < num_sub_ids:
    print('processing img',idx,'after',time.time()-start,'seconds')

    pred_batch = []
    batch_ids = []
    while len(pred_batch) < pred_batch_size:
        if idx < num_filekeys:
            filekey = filekeys[idx]
            img_id = filekey.split('/')[2].split('.')[0] # get filename, minus jpg

            # Pad with blank images for submission ids that have no image.
            while sub_idx < num_sub_ids and sub_ids[sub_idx] != img_id:
                print('but sub id is',sub_ids[sub_idx])
                batch_ids.append(sub_ids[sub_idx])
                pred_batch.append(blank_img)
                sub_idx += 1

            if sub_idx >= num_sub_ids:
                raise ValueError(
                    'image id ' + str(img_id) + ' was not found in the remaining sample submission ids')

            batch_ids.append(img_id)

            img = load_s3_file(filekey)
            processed_img = process_img(img)
            pred_batch.append(processed_img)
            idx += 1
            sub_idx += 1

        elif sub_idx < num_sub_ids:
            # All images consumed: remaining submission ids get blank images.
            print('but sub id is',sub_ids[sub_idx])
            batch_ids.append(sub_ids[sub_idx])
            pred_batch.append(blank_img)
            sub_idx += 1

        else:
            # Nothing left to add; predict on the final, partially filled batch.
            break

    predictions = model.predict_on_batch(np.array(pred_batch))
    for i, prediction in enumerate(predictions):
        # Map the model's output index back to the original class label and
        # keep the winning probability as the confidence.
        pred_pruned_class = np.argmax(prediction)
        pred_class = pruned_classes[pred_pruned_class]
        confidence = np.max(prediction)
        preds.append((batch_ids[i], pred_class, confidence))

processing img 0 after 0.0002486705780029297 seconds
processing img 121 after 50.73480296134949 seconds
processing img 245 after 101.51376700401306 seconds
processing img 373 after 148.15084385871887 seconds
processing img 497 after 198.0821497440338 seconds
processing img 624 after 247.4419093132019 seconds
processing img 749 after 297.3329689502716 seconds
processing img 873 after 346.7143979072571 seconds
processing img 999 after 393.9162976741791 seconds
processing img 1123 after 447.58734107017517 seconds
processing img 1246 after 493.74377369880676 seconds
processing img 1371 after 540.2479226589203 seconds
processing img 1495 after 586.9122774600983 seconds
processing img 1618 after 635.558272600174 seconds


In [None]:
# Sanity check on the number of collected predictions.
# NOTE(review): 117703 is presumably the row count of sample_submission.csv -- confirm.
print(len(preds), 'preds') # should be 117703

In [None]:
# save to csv
# One row per prediction: the image id and a "<class> <confidence>" string.
submission_rows = [
    (img_id, str(pred_class) + ' ' + str(confidence))
    for img_id, pred_class, confidence in preds
]
df = pd.DataFrame(submission_rows, columns=['id', 'landmarks'])
df.to_csv('pickles/submission.csv', index=False)