Check GPU

In [1]:
# Uses the standalone `keras` package here, whereas the cells below import
# `tensorflow.keras`.
from keras import backend as K
# `_get_available_gpus` is a private backend helper; the cell's value is the
# list of GPU device names it reports.
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

# INIT

Imports

In [2]:
import cv2

In [3]:
import sys, os, csv
from urllib import request, error
from PIL import Image
from io import BytesIO
import boto3
import random
import pandas as pd
import time

In [4]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
%matplotlib inline
import matplotlib.image as mpimg
import numpy as np
import io
from io import BytesIO

import tempfile
import pickle
from random import shuffle

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import applications
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import Callback

from skimage.transform import resize

Constants

In [6]:
# Notebook-wide configuration. Every name below is read by later cells.

# --- Locations -------------------------------------------------------------
model_dir = '../models'
tfrecord_dir = 'tf_data_test'    # key prefix (S3) or directory (local) holding the .tfrecord files
should_load_from_s3 = True       # True: list tfrecords in the bucket; False: list the local directory

# --- Batching --------------------------------------------------------------
batch_size = 64                  # images per prediction batch
tfrecord_batch_size = 500
num_classes = 14940              # width of the softmax layer / one-hot labels

# --- Image geometry --------------------------------------------------------
height, width = 100, 100
color_mode = 'rgb'
depth = {'rgb': 3}.get(color_mode, 1)   # 3 channels for rgb, otherwise 1

# --- Fine-tuning -----------------------------------------------------------
n_layers_to_tune = 0             # number of trailing pretrained layers left trainable

Setup

In [7]:
# Handle on the S3 bucket that the later cells list objects in and download
# files from. No credentials are passed here, so boto3 resolves them itself.
s3 = boto3.resource('s3')
bucket_name = 'landmark-data-12345'
bucket = s3.Bucket(bucket_name)

In [8]:
# Fetch the sample submission from the bucket once and cache it locally.
# Re-running the cell reuses the cached copy instead of downloading again.
source = 'data/sample_submission.csv'
dest = '../data/sample_submission.csv'

if not os.path.isfile(dest):
    # The download fails if the local parent directory is missing, so create
    # it first (no-op when it already exists).
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    bucket.download_file(source, dest)

sample_sub = pd.read_csv(dest)

In [9]:
# Collect the paths of every .tfrecord file, either as s3:// URLs for the
# objects under `tfrecord_dir` in the bucket, or as local paths inside the
# `tfrecord_dir` directory.
if should_load_from_s3:
    # Derive the URL prefix from bucket_name so the two cannot drift apart.
    prefix = 's3://' + bucket_name + '/'
    filenames = [
        prefix + obj.key
        for obj in bucket.objects.filter(Prefix=tfrecord_dir).all()
        if obj.key.endswith('tfrecord')
    ]
else:
    # os.path.join supplies the separator that plain string concatenation
    # dropped ('tf_data_test' + '1.tfrecord' -> 'tf_data_test1.tfrecord').
    filenames = [
        os.path.join(tfrecord_dir, filename)
        for filename in os.listdir(tfrecord_dir)
        if filename.endswith('tfrecord')
    ]

print('there are', len(filenames), 'tfrecord files')
# Only show a sample when there is one; indexing an empty list would raise.
if filenames:
    print('for example:', filenames[0])

there are 228 tfrecord files
for example: s3://landmark-data-12345/tf_data_test/tfrecord_temp/1.tfrecord


In [10]:
# Accumulators filled by the tfrecord-reading cell below. They live in their
# own cell, so re-running that cell alone appends to them rather than
# starting over.
pred_batches = []
id_batches = [] # ids for each batch; id_batches[i] lines up with pred_batches[i]

In [13]:
# with jpeg string
from tensorflow.keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

# Walk every tfrecord file, decode each serialized tf.train.Example into an
# RGB pixel array plus its hex id string, and group them into lists of at most
# `batch_size` items. Full groups are appended to `pred_batches` and
# `id_batches`, which stay index-aligned.
pred_batch = []
id_batch = []
num_records = 0

for idx, filename in enumerate(filenames):
    print(idx+1, 'of', len(filenames), 'tfrecords')
    record_iterator = tf.python_io.tf_record_iterator(path=filename)

    for record_string in record_iterator:
        num_records += 1  # counts every record read, including ones skipped below
        if len(pred_batch) == batch_size:
            # Current group is full: hand it over and start a new one.
            pred_batches.append(pred_batch)
            pred_batch = []
            id_batches.append(id_batch)
            id_batch = []

        example = tf.train.Example()
        example.ParseFromString(record_string)
        feature = example.features.feature

        img_byte_string = feature['image'].bytes_list.value[0]
        shape = feature['shape'].int64_list.value
        if len(shape) != 3:
            # Records without a 3-element shape are reported and skipped.
            print(shape)
            continue

        id_int_list = feature['id'].int64_list.value
        if not id_int_list:
            # An empty id list would divide by zero below, and the record
            # could not be matched to a submission row anyway.
            print('skipping record with empty id')
            continue

        # The id is stored as integer chunks of one 16-character hex string;
        # add leading zeros so every chunk has the same width.
        num_char_per_hex = 16//len(id_int_list)
        id_string = ''.join([ "{0:0{1}x}".format(id_int, num_char_per_hex) for id_int in id_int_list ])

        # Decode the encoded bytes into pixels. convert("RGB") always yields an
        # (rows, cols, 3) array, so no reshape is needed. Previously the decoded
        # image was overwritten by the raw protobuf Feature before being
        # appended, so the batches never contained pixel data.
        decoded_image = np.array(Image.open(io.BytesIO(img_byte_string)).convert("RGB"))

        pred_batch.append(decoded_image)
        id_batch.append(id_string)

# Flush the final, possibly partial, group.
if len(pred_batch) > 0:
    pred_batches.append(pred_batch)
    pred_batch = []
    id_batches.append(id_batch)
    id_batch = []

print('processed', num_records, 'records')

1 of 228 tfrecords
2 of 228 tfrecords


KeyboardInterrupt: 

In [15]:
# NOTE(review): this cell was labelled "resize the images", but it only prints
# the number of items currently held in the in-progress batch `pred_batch`;
# no resizing happens here.
print(len(pred_batch))

131886


In [None]:
def preprocess_fn(tensor):
    """Turn one serialized example into an (image, one-hot label) pair.

    Args:
        tensor: a serialized example, as passed to `tf.parse_single_example`.

    Returns:
        A tuple `(x, y)`: `x` is the JPEG-decoded image cast to float32 and
        resized with padding to `height` x `width`; `y` is the 'label'
        feature one-hot encoded over `num_classes`.
    """
    # 'shape' is deliberately left out of the spec (it was disabled in the
    # original). 'id' is parsed but not returned yet.
    # NOTE(review): 'id' is declared as a scalar here, while the cell that
    # reads the records by hand treats it as a list of ints - confirm.
    feature_spec = {
        'label': tf.FixedLenFeature([], dtype=tf.int64),
        'image': tf.FixedLenFeature([], dtype=tf.string),
        'id': tf.FixedLenFeature([], dtype=tf.int64),
    }
    parsed = tf.parse_single_example(tensor, feature_spec)

    pixels = tf.image.decode_jpeg(parsed['image'], channels=depth)
    pixels = tf.cast(pixels, tf.float32)
    x = tf.image.resize_image_with_pad(pixels, height, width)

    label_index = tf.cast(parsed['label'], tf.int32)
    y = tf.one_hot(label_index, num_classes)

    # todo: return id here?
    return x, y

# Build the input pipeline: read the tfrecord files, parse and batch them with
# preprocess_fn, repeat, and prefetch.

# TFRecordDataset lives under tf.data; the bare name is not imported by any of
# the import cells, so referencing it unqualified raised NameError.
dataset = tf.data.TFRecordDataset(filenames)

# NOTE(review): `n_workers` is not defined in the cells visible here - confirm
# it is set before running this cell.
dataset = dataset.apply(tf.contrib.data.map_and_batch(
    preprocess_fn, batch_size,
    num_parallel_batches=n_workers))

# repeat() with no count makes the dataset endless, so any consumer must bound
# the number of steps itself.
dataset = dataset.repeat()
dataset = dataset.prefetch(tf.contrib.data.AUTOTUNE) # todo: test without prefetch

# Model

In [12]:
# create model
# Pretrained VGG19 convolutional base (no classifier head). Keras takes
# input_shape as (rows, cols, channels), i.e. (height, width, depth); the
# original passed width first, which only worked because both are equal.
pre_model = applications.VGG19(weights="imagenet",
                               include_top=False,
                               input_shape=(height, width, depth))

# Copy the base layers into a Sequential model through the public `layers`
# attribute, freezing all but the last `n_layers_to_tune` of them.
base_layers = pre_model.layers
n_frozen = len(base_layers) - n_layers_to_tune

model = Sequential()
for idx, layer in enumerate(base_layers):
    if idx < n_frozen:
        layer.trainable = False
    model.add(layer)

# Classifier head: flatten the conv features, two 512-unit ReLU layers with
# dropout, then a softmax over all classes.
model.add(Flatten(input_shape=pre_model.output_shape[1:]))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(num_classes, activation='softmax'))

In [13]:
# Print the layer-by-layer table (output shapes and parameter counts) of the
# assembled model.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 100, 100, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 100, 100, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 50, 50, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 50, 50, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 50, 50, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 25, 25, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 25, 25, 256)       295168    
__________

Load model

In [14]:
# download weights
# The weights file name embeds `num_batch_to_load`. It is fetched from the
# bucket once and cached under `model_dir`, then loaded into the model.
num_batch_to_load = 4
weights_name = 'landmark_model_weights' + str(num_batch_to_load) + '.h5'
source = 'models/' + weights_name
# Use the configured model_dir instead of repeating its value inline.
dest = os.path.join(model_dir, weights_name)

if not os.path.isfile(dest):
    # The download fails if the local parent directory is missing, so create
    # it first (no-op when it already exists).
    os.makedirs(os.path.dirname(dest), exist_ok=True)
    bucket.download_file(source, dest)

model.load_weights(dest)

# PREDICT

In [None]:
# model.fit(test_dataset.make_one_shot_iterator(),
#             validation_data=validation_dataset.make_one_shot_iterator(),
#             epochs=5, 
#             steps_per_epoch=steps_per_epoch,
#             verbose=1,
#             callbacks=[SaveEachEpoch()]
#             )

In [None]:
# Predict on the test images, batch_size*4 images per model call.
#
# `filekeys` and `sample_sub` are walked in lock-step. Any submission id that
# has no matching image file gets an all-zero placeholder image, so every id
# that is passed over still receives a row in `preds`.
#
# NOTE(review): `filekeys`, `num_filekeys`, `load_s3_file`, `process_img` and
# `pruned_classes` are not defined in the cells visible here - confirm they
# exist before running.
# NOTE(review): submission ids that come after the last image file are never
# visited, so `preds` can end up shorter than `sample_sub` - confirm whether
# they need padding too.
preds = []
idx = 0
sub_idx = 0

images_per_call = batch_size * 4
# Pull the id column out once instead of doing a positional row lookup per image.
submission_ids = sample_sub['id'].tolist()

start = time.time()
while idx < num_filekeys:
    print('processing img',idx,'after',time.time()-start,'seconds')

    pred_batch = []
    batch_ids = []
    # Stop filling when the chunk is full OR the files run out. Without the
    # second condition the final partial chunk could never reach the size
    # limit and the loop spun forever.
    while len(pred_batch) < images_per_call and idx < num_filekeys:
        filekey = filekeys[idx]
        img_id = filekey.split('/')[2].split('.')[0] # get filename, minus jpg

        # Emit placeholders for submission ids that precede this image.
        curr_sub_id = submission_ids[sub_idx]
        while curr_sub_id != img_id:
            print('but sub id is',curr_sub_id)
            batch_ids.append(curr_sub_id)
            pred_batch.append(np.zeros((height, width, depth)))
            sub_idx += 1
            curr_sub_id = submission_ids[sub_idx]

        batch_ids.append(img_id)

        img = load_s3_file(filekey)
        processed_img = process_img(img)
        pred_batch.append(processed_img)
        idx += 1
        sub_idx += 1

    predictions = model.predict_on_batch(np.array(pred_batch))

    for i, prediction in enumerate(predictions):
        # Map the arg-max output index to a class through `pruned_classes`
        # and keep the winning score as the confidence.
        pred_pruned_class = np.argmax(prediction)
        pred_class = pruned_classes[pred_pruned_class]
        confidence = np.max(prediction)
        preds.append((batch_ids[i], pred_class, confidence))

processing img 0 after 0.0002486705780029297 seconds
processing img 121 after 50.73480296134949 seconds
processing img 245 after 101.51376700401306 seconds
processing img 373 after 148.15084385871887 seconds
processing img 497 after 198.0821497440338 seconds
processing img 624 after 247.4419093132019 seconds
processing img 749 after 297.3329689502716 seconds
processing img 873 after 346.7143979072571 seconds
processing img 999 after 393.9162976741791 seconds
processing img 1123 after 447.58734107017517 seconds
processing img 1246 after 493.74377369880676 seconds
processing img 1371 after 540.2479226589203 seconds
processing img 1495 after 586.9122774600983 seconds
processing img 1618 after 635.558272600174 seconds


In [None]:
# Sanity check: one entry in `preds` is expected per submission row.
print(len(preds), 'preds') # should be 117703

In [None]:
# save to csv
df = pd.DataFrame()
df['id'] = pd.Series([ pred[0] for pred in preds ])
df['landmarks'] = pd.Series([ (str(pred[1]) + ' ' + str(pred[2])) for pred in preds ])
df.to_csv('pickles/submission.csv', index=False)