In [1]:
import argparse
import logging
import db.mariadb as mariadb
import os
import MySQLdb

import imageio
import numpy as np

import keras
from keras.preprocessing import image

from keras.applications.resnet50 import preprocess_input
from requests_futures.sessions import FuturesSession
from PIL.Image import DecompressionBombError

# Configure the root logger's output format, then set up this notebook's own logger.
logging.basicConfig(format='%(asctime)s-%(levelname)s-%(name)s - %(message)s')

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# NOTE(review): `formatter` is not attached to any handler in the visible cells;
# it is kept because code outside this view may use it.
formatter = logging.Formatter(
    fmt='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
)


def load_model():
    """Build the ResNet50 network used for feature extraction.

    The network is created without its top layers (``include_top=False``) and
    with average pooling (``pooling='avg'``).

    :return: the constructed Keras ResNet50 model
    """
    return keras.applications.resnet50.ResNet50(include_top=False, pooling='avg')


# Build the network once; the feature-extraction cell below calls model.predict on it.
model = load_model()


Using TensorFlow backend.


In [2]:
# Change according to your database setup (these are the defaults, see .env file)

con = mariadb.get_connection("127.0.0.1", 3308, "image_processing", "user", "user_pw")
try:
    files_batch = mariadb.get_files_data(0, 800000, con)
finally:
    # Always release the connection, even when the query raises.
    con.close()

# Use every 100th image of the 800000
files_batch = files_batch[::100]


def _extract_resnet_feature(img_path):
    """Return the flattened ResNet50 output for the image stored at ``img_path``.

    The image is loaded at 224x224, converted to an array, given a leading
    batch axis, run through ``preprocess_input`` and passed to ``model.predict``.

    :param img_path: path of the image file to load
    :return: 1-D numpy array with the flattened prediction
    :raises OSError: propagated from loading when the file cannot be read as an image
    :raises DecompressionBombError: propagated from loading for oversized images
    """
    img = image.load_img(img_path, target_size=(224, 224))
    img_data = np.expand_dims(image.img_to_array(img), axis=0)
    img_data = preprocess_input(img_data)
    return np.array(model.predict(img_data)).flatten()


features_train = []
features_test = []

features_train_info = []
features_test_info = []

# Create training and test features.
# `counter` only advances for images that were processed successfully, so every
# third *usable* image goes to the test split and the rest to the training split.
counter = 1
for (img_id, name, path, url) in files_batch:
    try:
        res_net_feature = _extract_resnet_feature(path)
    except OSError as e:
        # Unreadable image: report it and skip.
        print(e)
        continue
    except DecompressionBombError as e:
        # Print the path as well so the oversized file can be located.
        print(e)
        print(path)
        continue

    if counter % 3 == 0:
        features_test.append(res_net_feature)
        features_test_info.append((img_id, name, path, url))
    else:
        features_train.append(res_net_feature)
        features_train_info.append((img_id, name, path, url))

    counter += 1

# Stack the per-image vectors into 2-D arrays (one row per image).
features_train = np.array(features_train)
features_test = np.array(features_test)

print(features_train.shape)
print(len(features_train_info))

print(features_test.shape)
print(len(features_test_info))

cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700c/oA_14610.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700b/FA-Kae54-22_002921808,02.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700d/FA996-12_0002921601,11.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700d/Mal1043-01_21751,03.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700d/Mal18-07_140891,01.jpg'




cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0575/D-DAI-ROM-56.927_002922407.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0577/D-DAI-ROM-67.16_56977.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0577/D-DAI-ROM-71.624_000300141352.jpg'




Image size (205393290 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.




cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0067.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0175.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0277.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0376.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0594.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0694.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0794.JPG'
(5323, 2048)
5323
(2661, 2048)
2661


In [3]:
# Largest value in each split, printed for comparison.
train_max = np.max(features_train)
test_max = np.max(features_test)
print(train_max)
print(test_max)

# TODO: Evaluate which set has higher max value and select higher
# Both splits are divided by the *training* maximum so they share one scale.
features_train_scaled = features_train / train_max
features_test_scaled = features_test / train_max

print(np.max(features_train_scaled))
print(np.max(features_test_scaled))

29.489676
21.073988


In [4]:
from sklearn.neighbors import NearestNeighbors
# Nearest-neighbour index over the scaled training features (10 neighbours per row).
neigh = NearestNeighbors(n_neighbors=10)
result = neigh.fit(X=features_train_scaled)

# No query array is passed; the result is a pair whose first element holds
# distances and whose second holds row indices (see the printed output below).
neighbours = result.kneighbors(return_distance=True)

In [5]:
# Inspect the kneighbors result: a pair of (distances, indices).
distances, indices = neighbours
print(len(neighbours))
print(len(distances))
print(distances)
print(len(indices))
print(indices)

2
5323
[[0.8869949  0.95451961 0.96011991 ... 0.9791909  0.98546634 0.9955557 ]
 [1.26883223 1.30483152 1.3814718  ... 1.44745823 1.44969341 1.45190205]
 [1.1523444  1.19131711 1.20262143 ... 1.2408876  1.2468743  1.2603429 ]
 ...
 [1.03671313 1.03765333 1.06060254 ... 1.08386614 1.08407201 1.08443034]
 [1.06930514 1.07236736 1.08549986 ... 1.11641227 1.11959069 1.12072677]
 [0.74386297 0.77322731 0.79062213 ... 0.83160336 0.83692799 0.84112598]]
5323
[[1821 2854 2761 ... 3762 2723 2719]
 [ 721 2822 3091 ...  208 4134 1022]
 [ 184  204 1096 ... 1089 2822 3232]
 ...
 [4274 2145 4962 ... 1395 2131 3367]
 [5193 1383 4025 ... 1060 4137 1394]
 [5192 5314  160 ... 4712 5181 1056]]


In [36]:
# Initialize and run autoencoder

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.optimizers import Adam, Adadelta
from keras import regularizers
import keras.backend as K

def euclidean_distance_loss(y_true, y_pred):
    """
    Euclidean distance loss
    https://en.wikipedia.org/wiki/Euclidean_distance

    The squared distance is clamped to at least ``K.epsilon()`` before taking
    the square root: the derivative of sqrt at 0 is infinite, so an exact
    reconstruction would otherwise yield NaN gradients.

    :param y_true: TensorFlow/Theano tensor
    :param y_pred: TensorFlow/Theano tensor of the same shape as y_true
    :return: tensor of distances, reduced over the last axis
    """
    squared_distance = K.sum(K.square(y_pred - y_true), axis=-1)
    return K.sqrt(K.maximum(squared_distance, K.epsilon()))


# Autoencoder: input -> 512 -> 32 ("bottleneck") -> 512 -> input width.
m = Sequential([
    Dense(512, activation='elu', input_shape=(features_train_scaled.shape[1],)),
    Dense(32, activation='linear', name="bottleneck"),
    Dense(512, activation='elu'),
    Dense(features_train.shape[1], activation='sigmoid'),
])
m.compile(loss=euclidean_distance_loss, optimizer=Adadelta())

# Train the network to reproduce its own input; the test split is used for validation only.
history = m.fit(
    x=features_train_scaled,
    y=features_train_scaled,
    batch_size=128,
    epochs=300,
    verbose=1,
    validation_data=(features_test_scaled, features_test_scaled),
)

# Sub-model that stops at the bottleneck layer.
encoder = Model(inputs=m.input, outputs=m.get_layer('bottleneck').output)
# Encoded values (32 floats instead of 2048) for every training row.
enc = encoder.predict(features_train_scaled)

Train on 5323 samples, validate on 2661 samples
Epoch 1/300
Epoch 2/300
Epoch 3/300
Epoch 4/300
Epoch 5/300
Epoch 6/300
Epoch 7/300
Epoch 8/300
Epoch 9/300
Epoch 10/300
Epoch 11/300
Epoch 12/300
Epoch 13/300
Epoch 14/300
Epoch 15/300
Epoch 16/300
Epoch 17/300
Epoch 18/300
Epoch 19/300
Epoch 20/300
Epoch 21/300
Epoch 22/300
Epoch 23/300
Epoch 24/300
Epoch 25/300
Epoch 26/300
Epoch 27/300
Epoch 28/300
Epoch 29/300
Epoch 30/300
Epoch 31/300
Epoch 32/300
Epoch 33/300
Epoch 34/300
Epoch 35/300
Epoch 36/300
Epoch 37/300
Epoch 38/300
Epoch 39/300
Epoch 40/300
Epoch 41/300
Epoch 42/300
Epoch 43/300
Epoch 44/300
Epoch 45/300
Epoch 46/300
Epoch 47/300
Epoch 48/300
Epoch 49/300
Epoch 50/300
Epoch 51/300
Epoch 52/300
Epoch 53/300
Epoch 54/300
Epoch 55/300
Epoch 56/300
Epoch 57/300
Epoch 58/300
Epoch 59/300
Epoch 60/300
Epoch 61/300
Epoch 62/300
Epoch 63/300
Epoch 64/300
Epoch 65/300
Epoch 66/300
Epoch 67/300
Epoch 68/300
Epoch 69/300
Epoch 70/300
Epoch 71/300
Epoch 72/300
Epoch 73/300
Epoch 74/300

Epoch 78/300
Epoch 79/300
Epoch 80/300
Epoch 81/300
Epoch 82/300
Epoch 83/300
Epoch 84/300
Epoch 85/300
Epoch 86/300
Epoch 87/300
Epoch 88/300
Epoch 89/300
Epoch 90/300
Epoch 91/300
Epoch 92/300
Epoch 93/300
Epoch 94/300
Epoch 95/300
Epoch 96/300
Epoch 97/300
Epoch 98/300
Epoch 99/300
Epoch 100/300
Epoch 101/300
Epoch 102/300
Epoch 103/300
Epoch 104/300
Epoch 105/300
Epoch 106/300
Epoch 107/300
Epoch 108/300
Epoch 109/300
Epoch 110/300
Epoch 111/300
Epoch 112/300
Epoch 113/300
Epoch 114/300
Epoch 115/300
Epoch 116/300
Epoch 117/300
Epoch 118/300
Epoch 119/300
Epoch 120/300
Epoch 121/300
Epoch 122/300
Epoch 123/300
Epoch 124/300
Epoch 125/300
Epoch 126/300
Epoch 127/300
Epoch 128/300
Epoch 129/300
Epoch 130/300
Epoch 131/300
Epoch 132/300
Epoch 133/300
Epoch 134/300
Epoch 135/300
Epoch 136/300
Epoch 137/300
Epoch 138/300
Epoch 139/300
Epoch 140/300
Epoch 141/300
Epoch 142/300
Epoch 143/300
Epoch 144/300
Epoch 145/300
Epoch 146/300
Epoch 147/300
Epoch 148/300
Epoch 149/300
Epoch 150/300


Epoch 155/300
Epoch 156/300
Epoch 157/300
Epoch 158/300
Epoch 159/300
Epoch 160/300
Epoch 161/300
Epoch 162/300
Epoch 163/300
Epoch 164/300
Epoch 165/300
Epoch 166/300
Epoch 167/300
Epoch 168/300
Epoch 169/300
Epoch 170/300
Epoch 171/300
Epoch 172/300
Epoch 173/300
Epoch 174/300
Epoch 175/300
Epoch 176/300
Epoch 177/300
Epoch 178/300
Epoch 179/300
Epoch 180/300
Epoch 181/300
Epoch 182/300
Epoch 183/300
Epoch 184/300
Epoch 185/300
Epoch 186/300
Epoch 187/300
Epoch 188/300
Epoch 189/300
Epoch 190/300
Epoch 191/300
Epoch 192/300
Epoch 193/300
Epoch 194/300
Epoch 195/300
Epoch 196/300
Epoch 197/300
Epoch 198/300
Epoch 199/300
Epoch 200/300
Epoch 201/300
Epoch 202/300
Epoch 203/300
Epoch 204/300
Epoch 205/300
Epoch 206/300
Epoch 207/300
Epoch 208/300
Epoch 209/300
Epoch 210/300
Epoch 211/300
Epoch 212/300
Epoch 213/300
Epoch 214/300
Epoch 215/300
Epoch 216/300
Epoch 217/300
Epoch 218/300
Epoch 219/300
Epoch 220/300
Epoch 221/300
Epoch 222/300
Epoch 223/300
Epoch 224/300
Epoch 225/300
Epoch 

Epoch 232/300
Epoch 233/300
Epoch 234/300
Epoch 235/300
Epoch 236/300
Epoch 237/300
Epoch 238/300
Epoch 239/300
Epoch 240/300
Epoch 241/300
Epoch 242/300
Epoch 243/300
Epoch 244/300
Epoch 245/300
Epoch 246/300
Epoch 247/300
Epoch 248/300
Epoch 249/300
Epoch 250/300
Epoch 251/300
Epoch 252/300
Epoch 253/300
Epoch 254/300
Epoch 255/300
Epoch 256/300
Epoch 257/300
Epoch 258/300
Epoch 259/300
Epoch 260/300
Epoch 261/300
Epoch 262/300
Epoch 263/300
Epoch 264/300
Epoch 265/300
Epoch 266/300
Epoch 267/300
Epoch 268/300
Epoch 269/300
Epoch 270/300
Epoch 271/300
Epoch 272/300
Epoch 273/300
Epoch 274/300
Epoch 275/300
Epoch 276/300
Epoch 277/300
Epoch 278/300
Epoch 279/300
Epoch 280/300
Epoch 281/300
Epoch 282/300
Epoch 283/300
Epoch 284/300
Epoch 285/300
Epoch 286/300
Epoch 287/300
Epoch 288/300
Epoch 289/300
Epoch 290/300
Epoch 291/300
Epoch 292/300
Epoch 293/300
Epoch 294/300
Epoch 295/300
Epoch 296/300
Epoch 297/300
Epoch 298/300
Epoch 299/300
Epoch 300/300


In [37]:
# Nearest neighbours computed on the bottleneck codes.
# NOTE(review): this calls fit() again on the `neigh` estimator created in an
# earlier cell instead of creating a new one — confirm that reuse is intended.
result_enc = neigh.fit(X=enc)
neighbours_enc = result_enc.kneighbors(return_distance=True)

In [38]:
# Full pass through the autoencoder: reconstruction, 2048 -> 32 -> 2048.
enc_r = m.predict(x=features_train_scaled)

In [39]:
# Nearest neighbours computed on the reconstructed feature vectors.
result_enc_r = neigh.fit(enc_r)
# Query through the handle returned by *this* fit. The previous code queried
# `result_enc`, a name defined by a different cell for the compressed features.
neighbours_enc_r = result_enc_r.kneighbors()

In [40]:
# Smallest value in the scaled training features.
features_train_scaled.min()

0.0

In [41]:
# Smallest value in the scaled test features.
features_test_scaled.min()

0.0

In [42]:
# Largest value in the scaled training features.
features_train_scaled.max()

1.0

In [43]:
# Largest value in the scaled test features (these were divided by the training maximum).
features_test_scaled.max()

0.71462256

In [44]:
# Neighbour indices of row 0: raw features, then reconstructions, then bottleneck codes.
for _neighbour_pair in (neighbours, neighbours_enc_r, neighbours_enc):
    print(_neighbour_pair[1][0])

[1821 2854 2761 3123 3441 1823 3599 3762 2723 2719]
[4093 2854 2723 4018 3123 2739 3762 4224 1821 2642]
[4093 2854 2642 2723 3762 1821 5084 4012 4018 2742]


In [45]:
# Neighbour indices of row 1: raw features, then reconstructions, then bottleneck codes.
for _neighbour_pair in (neighbours, neighbours_enc_r, neighbours_enc):
    print(_neighbour_pair[1][1])

[ 721 2822 3091 4785 4879 2877 1107  208 4134 1022]
[ 721 2822  148 1092 1365  150  204    2 1842 1088]
[ 721 2822  150    2  173  148 1842 1088 1092  204]


In [46]:
# Neighbour indices of row 2: raw features, then reconstructions, then bottleneck codes.
for _neighbour_pair in (neighbours, neighbours_enc_r, neighbours_enc):
    print(_neighbour_pair[1][2])

[ 184  204 1096  225 1090 1088 1085 1089 2822 3232]
[ 204  184 1090  225 1107 1088 1089 1092  721 3232]
[ 204 1090  225  184 1088 1089 1092 1107 1085 1096]


In [47]:
# Dimensions of the encoded (bottleneck) array.
np.shape(enc)

(5323, 32)

In [48]:
# Dimensions of the scaled training feature array.
np.shape(features_train_scaled)

(5323, 2048)

In [49]:
# Persist the scaled features, their 32-d codes and their reconstructions.
# NOTE(review): the file names are kept exactly as they were (including their
# spelling) because other code may load them by these names.
for _file_name, _array in (
    ('features.npy', features_train_scaled),
    ('alternate_features_compresssed.npy', enc),
    ('alternate_featuers_reconstructed.npy', enc_r),
):
    np.save(_file_name, _array)

In [50]:
# Persist the scaled test features.
np.save(file='features_test.npy', arr=features_test_scaled)

In [51]:
import json


# Write the (img_id, name, path, url) records of each split as JSON, one file per split.
for _info_path, _info_rows in (
    ('features_train_info.txt', features_train_info),
    ('features_test_info.txt', features_test_info),
):
    with open(_info_path, 'w') as outfile:
        json.dump(_info_rows, outfile)

In [52]:
# NOTE(review): this import rebinds the name `load_model`, which the first cell
# defines as the notebook's own ResNet50 builder; it is not used in this cell.
from keras.models import load_model

# Save the bottleneck encoder and the full autoencoder.
for _network, _target_file in ((encoder, 'encoder.h5'), (m, 'autoencoder.h5')):
    _network.save(_target_file)