Check GPU

In [1]:
eager_exec = False

In [2]:
import tensorflow as tf
if eager_exec:
    tf.enable_eager_execution()
    
import os
loglevel = '2'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = loglevel

print('using tf version', tf.__version__, 'with log level', loglevel)

using tf version 1.10.1 with log level 2


In [3]:
from keras import backend as K
K.tensorflow_backend._get_available_gpus()

Using TensorFlow backend.


['/job:localhost/replica:0/task:0/device:GPU:0']

Imports

In [4]:
import sys, os, csv
from urllib import request, error
from PIL import Image
from io import BytesIO
import boto3
import random
import base64

In [5]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
%matplotlib inline
import matplotlib.image as mpimg
import numpy as np
import io
from io import BytesIO

import time
import tempfile
import pickle
import math
from random import shuffle
import multiprocessing
from multiprocessing import Process, Manager

In [6]:
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras import applications
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.layers import Conv2D, MaxPooling2D
from tensorflow.keras.callbacks import Callback
from tensorflow.keras.utils import Sequence
from tensorflow.data import TFRecordDataset
import tensorflow.contrib.eager as tfe

from skimage.transform import resize

In [7]:
print('Using tensorflow', tf.__version__)
print('Using keras', keras.__version__)

Using tensorflow 1.10.1
Using keras 2.1.6-tf


Constants

In [8]:
# set constants
model_dir = '../models'
tfrecord_dir = 'tf_data'

batch_size = 64
tfrecord_batch_size = 500
num_classes = 14950

height = 100
width = 100
color_mode = 'rgb'
depth = 3 if color_mode == 'rgb' else 1

n_cpus = multiprocessing.cpu_count()
n_workers = n_cpus - 1 # None defaults to n_cpus
print('There are', n_cpus, 'cpu cores available')

save_model_weights = False

There are 4 cpu cores available


Setup

In [9]:
s3 = boto3.resource('s3')
bucket_name = 'landmark-data-12345'
bucket = s3.Bucket(bucket_name)

In [10]:
prefix = 's3://landmark-data-12345/'
filenames = [ prefix + obj.key for obj in bucket.objects.filter(Prefix=tfrecord_dir).all() if obj.key.endswith('tfrecord') ]

# filenames = filenames[:1]
print('there are', len(filenames), 'tfrecord files')
print('for example:', filenames[0])

there are 2638 tfrecord files
for example: s3://landmark-data-12345/tf_data/1.tfrecord


Model

In [11]:
# create model
pre_model = applications.VGG19(weights="imagenet", 
                           include_top=False, 
                           input_shape=(height, width, depth))

model = Sequential()
for idx, layer in enumerate(pre_model._layers):
    layer.trainable = False
    model.add(layer)
    
model.add(Flatten(input_shape=pre_model.output_shape[1:]))
model.add(Dense(512, activation='relu'))
model.add(Dense(512, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

In [12]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
block1_conv1 (Conv2D)        (None, 100, 100, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 100, 100, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 50, 50, 64)        0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 50, 50, 128)       73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 50, 50, 128)       147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 25, 25, 128)       0         
_________________________________________________________________
block3_conv1 (Conv2D)        (None, 25, 25, 256)       295168    
__________

In [13]:
# compile model
if not eager_exec:
    model.compile(loss='categorical_crossentropy',
                 optimizer='adam',
                 metrics=['accuracy', 'categorical_accuracy'])

Train

In [14]:
# tell model to save after each epoch
class SaveEachEpoch(Callback):
    def on_epoch_end(self, epoch, logs={}):
        if save_model_weights:
            epoch = epoch + 1
            print('SEE: saving model for epoch', epoch)

            filename1 = '/small_landmark_model_' + str(epoch) + '.h5'
            filename2 = '/small_landmark_model_weights_' + str(epoch) + '.h5'

            source1 = model_dir+filename1
            dest1 = 'models'+filename1
            source2 = model_dir+filename2
            dest2 = 'models'+filename2

            try:
                self.model.save(source1)
                bucket.upload_file(source1, dest1)

                self.model.save_weights(source2)
                bucket.upload_file(source2, dest2)

            except:
                print('SEE: error saving model')
                return

            print('SEE: done saving model')

# numpy arrays dataset

In [15]:
pred_batches = []
pred_batch = []
labels = []
num_records = 0

# for idx, filename in enumerate(filenames):
#     print(idx+1, 'of', len(filenames), 'tfrecords')
#     record_iterator = tf.python_io.tf_record_iterator(path=filename)
    
#     for idx2, record_string in enumerate(record_iterator):
#         num_records += 1
#         if len(pred_batch) == batch_size:
#             pred_batches.append(pred_batch)
#             pred_batch = []
#             print('finished batch', len(pred_batches))

#         example = tf.train.Example()
#         example.ParseFromString(record_string)

#         img_byte_string = example.features.feature['image'].bytes_list.value[0]
#         shape = example.features.feature['shape'].int64_list.value    
#         if len(shape) != 3:
#             print('shape is', shape, ', skipping')
#             continue

#         tempBuff = io.BytesIO()
#         tempBuff.write(img_byte_string)
#         tempBuff.seek(0)

#         image_data = Image.open(tempBuff).convert("RGB")
#         image_data = np.array(image_data).reshape(shape)
#         image_data = resize(image_data, (height, width, depth), anti_aliasing=True)
        
#         pred_batch.append(image_data)
        
#         label = example.features.feature['label'].int64_list.value[0]
#         labels.append(label)
        
# if len(pred_batch) > 0:
#     pred_batches.append(pred_batch)
#     pred_batch = []
    
# plt.figure()
# imshow(pred_batches[0][0])

# print('processed', num_records, 'records')
# print(len(pred_batches), 'batches of size', batch_size)

In [16]:
x = []
for pred_batch in pred_batches:
    for image_data in pred_batch:
        x.append(image_data)
        
y = []
for label in labels:
    one_hot = np.zeros((num_classes))
    one_hot[label] = 1 # starts at 0
    y.append(one_hot)

In [17]:
print(np.array(x).shape)
# print(x)

(0,)


In [18]:
print(np.array(y).shape)
# print(y)

(0,)


# tfdataset

In [19]:
def filter_func(tensor):
    features = {
        'shape': tf.FixedLenFeature([], dtype=tf.int64)
    }
    example = tf.parse_single_example(tensor, features)

    return tf.not_equal(tf.size(example['shape']), 0)

def custom_decode_image(tensor):
    return tf.cast(tf.image.decode_jpeg(tensor, channels=depth), tf.float32)

def x_preprocess_fn(tensor):
    features = {
        'label': tf.FixedLenFeature([], dtype=tf.int64),
        'image': tf.FixedLenFeature([], dtype=tf.string),
    }
    example = tf.parse_single_example(tensor, features)

    x = custom_decode_image(example['image'])
    
    x = tf.multiply(x, tf.constant(1.0 / 255, dtype=tf.float32))

    return x

def resize_fn(tensor):
    x = tf.image.resize_bilinear(tensor, (height, width), align_corners=False)
#     x = tf.image.resize_nearest_neighbor(tensor, (height, width), align_corners=False)
#     x = tf.image.resize_area(tensor, (height, width), align_corners=False)
#     x = tf.image.resize_image_with_pad(tensor, height, width)
    
    return x

def y_preprocess_fn(tensor):
    features = {
        'label': tf.FixedLenFeature([], dtype=tf.int64),
        'image': tf.FixedLenFeature([], dtype=tf.string),
    }
    example = tf.parse_single_example(tensor, features)

    y = tf.one_hot(tf.cast(example['label'], tf.int32), num_classes)
    
    return y


predataset = TFRecordDataset(filenames).filter(filter_func)

xdataset = predataset.map(x_preprocess_fn).batch(1).map(resize_fn).flat_map(lambda x: tf.data.TFRecordDataset.from_tensor_slices(x)).batch(batch_size)

ydataset = predataset.map(y_preprocess_fn).batch(batch_size)

dataset = tf.data.TFRecordDataset.zip((xdataset, ydataset)).repeat(10)

# if eager_exec:
#     for y in ydataset.make_one_shot_iterator():
#         print(np.argmax(y.numpy()))

# if eager_exec:
#     for x,y in dataset.take(10).make_one_shot_iterator():
#         print(x.numpy()[0])
#         plt.figure()
#         imshow(x.numpy()[0])

# train

In [21]:
# train model
n_epochs = 10

# print('training on', len(x), 'examples for', n_epochs, 'epochs with batch size', batch_size)
# model.fit(x=np.array(x),
#           y=np.array(y),
#           epochs=n_epochs,
#           batch_size=batch_size,
#           verbose=1,
#           callbacks=[SaveEachEpoch()]
#          )

num_examples = len(filenames) * tfrecord_batch_size
steps_per_epoch = num_examples // batch_size
if num_examples % batch_size > 0:
    steps_per_epoch += 1

print('training for', n_epochs, 'epochs with batch size', batch_size, 'for', steps_per_epoch, 'steps per epoch')
model.fit(dataset.make_one_shot_iterator(),
          epochs=n_epochs,
          steps_per_epoch=steps_per_epoch,
          callbacks=[SaveEachEpoch()],
          verbose=1
         )

training for 10 epochs with batch size 1 for 1319000 steps per epoch
Epoch 1/10
    906/1319000 [..............................] - ETA: 40:15:27 - loss: 8.8175 - acc: 0.0430 - categorical_accuracy: 0.0430

KeyboardInterrupt: 

In [22]:
softmax_preds = model.predict(np.array(x))
preds = []
for i, pred in enumerate(softmax_preds):
    pred_class = np.argmax(pred)
    confidence = np.max(pred)
    preds.append(pred_class)

In [23]:
n_correct = 0
for idx, pred in enumerate(preds):
    if pred == labels[idx]:
        n_correct += 1
        
print('accuracy:', float(n_correct)/len(x))

accuracy: 0.6323232323232323


In [24]:
print(preds)

[6051, 1281, 1157, 1904, 14912, 14669, 7505, 9633, 10067, 8495, 1553, 9779, 2622, 4645, 2311, 3301, 9193, 9201, 8598, 9633, 3550, 14931, 3804, 9633, 4827, 623, 14565, 9633, 6599, 9738, 5090, 9633, 9633, 4981, 5554, 11010, 4735, 6051, 6051, 11425, 9633, 9633, 9633, 8429, 9633, 9633, 9633, 2963, 12220, 6599, 12142, 9633, 12533, 9633, 9633, 9633, 6642, 1828, 1946, 3504, 2665, 6051, 9633, 14873, 8143, 12829, 5260, 9633, 2743, 9633, 9633, 2996, 3034, 10932, 5574, 9633, 13208, 5661, 13773, 10567, 10834, 8090, 5376, 9633, 6599, 9335, 9633, 7700, 8169, 14565, 4981, 9633, 9633, 6091, 7761, 8274, 8487, 10932, 9633, 823, 6051, 5946, 9633, 9633, 6599, 6091, 5635, 11058, 6051, 9633, 6051, 6051, 10075, 12609, 11424, 1376, 2870, 3550, 4827, 9633, 9633, 10757, 2885, 8705, 7150, 7942, 7218, 9779, 428, 9633, 3238, 7083, 369, 9633, 2743, 6599, 960, 9633, 11864, 9334, 2429, 11475, 5506, 5376, 5864, 2209, 9633, 2743, 9633, 5554, 14378, 5380, 4645, 9633, 5376, 2842, 12718, 9779, 8274, 7262, 12866, 5794, 963

In [25]:
print(labels)

[6103, 1281, 1157, 1904, 3495, 1383, 1472, 3283, 14860, 6599, 10378, 9779, 2622, 4645, 2311, 3301, 9193, 9201, 8598, 9633, 6696, 14931, 3804, 8063, 4827, 623, 12172, 9633, 3695, 9738, 5090, 6696, 2061, 10653, 1140, 11010, 4735, 9638, 5506, 11425, 9633, 4352, 5714, 5880, 3137, 6846, 9633, 2963, 12220, 6347, 12142, 10368, 12533, 3924, 12220, 12992, 6642, 1828, 1946, 3504, 2665, 5259, 11536, 14873, 8143, 12829, 12514, 4522, 2743, 5554, 9633, 3736, 3034, 10932, 11643, 7096, 13208, 9479, 13773, 10567, 10834, 8090, 5376, 2339, 6231, 7013, 6008, 7700, 8169, 11037, 11815, 14562, 14104, 9779, 7761, 4352, 8487, 10932, 9633, 823, 6051, 2044, 11755, 13873, 6599, 9779, 5635, 11058, 6051, 6696, 6651, 1213, 13926, 12950, 11424, 1376, 2870, 3550, 3609, 12676, 4085, 10757, 2885, 8705, 7150, 7942, 7218, 7476, 14386, 428, 3238, 7083, 369, 9633, 2743, 9571, 14079, 14544, 9335, 9334, 6696, 11475, 5506, 5376, 5864, 2209, 8501, 7730, 9633, 5554, 14378, 5380, 7922, 8063, 3518, 2842, 12718, 9779, 6112, 7262, 1

In [34]:
print(np.min(np.array(x)[0]))
print(np.min(np.array(x)[0]))

0.0196078431372549
0.0196078431372549
