In [1]:
import argparse
import logging
import db.mariadb as mariadb
import os
import MySQLdb

import imageio
import numpy as np

import keras
from keras.preprocessing import image

from keras.applications.resnet50 import preprocess_input
from requests_futures.sessions import FuturesSession
from PIL.Image import DecompressionBombError

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
logging.basicConfig(format='%(asctime)s-%(levelname)s-%(name)s - %(message)s')


model = keras.applications.resnet50.ResNet50(include_top=False, pooling='avg')


Using TensorFlow backend.
Instructions for updating:
Colocations handled automatically by placer.


In [2]:
# Change according to your database setup (these are the defaults, see .env file)

con = mariadb.get_connection("127.0.0.1", 3308, "image_processing", "user", "user_pw")

files_batch = mariadb.get_files_data(0, 800000, con)

con.close()

# Use every 100th image of the 800000
files_batch = files_batch[::100]
features_train = []
features_test = []

features_train_info = []
features_test_info = []

# Create training and test features
counter = 1
for (img_id, name, path, url) in files_batch:
    try:
        img = image.load_img(path, target_size=(224, 224))
        # print(type(img))
        img_data = image.img_to_array(img)
        # print(type(img_data))
        # print(img_data.shape)
        img_data = np.expand_dims(img_data, axis=0)
        # print(img_data.shape)
        img_data = preprocess_input(img_data)
        # print(img_data.shape)
        res_net_feature = model.predict(img_data)
        # print(res_net_feature.shape)
        res_net_feature = np.array(res_net_feature).flatten()
        # print(res_net_feature.shape)

        if counter % 3 == 0:
            features_test.append(res_net_feature)
            features_test_info.append((img_id, name, path, url))
        else:
            features_train.append(res_net_feature)
            features_train_info.append((img_id, name, path, url))

        counter += 1

    except OSError as e:
        print(e)
    except DecompressionBombError as e:
        print(e)
        print(path)

features_train = np.array(features_train)
features_test = np.array(features_test)

print(features_train.shape)
print(len(features_train_info))

print(features_test.shape)
print(len(features_test_info))

cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700c/oA_14610.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700b/FA-Kae54-22_002921808,02.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700d/FA996-12_0002921601,11.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700d/Mal1043-01_21751,03.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0700d/Mal18-07_140891,01.jpg'




cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0575/D-DAI-ROM-56.927_002922407.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0577/D-DAI-ROM-67.16_56977.jpg'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/FADatenbankabb0577/D-DAI-ROM-71.624_000300141352.jpg'




Image size (205393290 pixels) exceeds limit of 178956970 pixels, could be decompression bomb DOS attack.
/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2007/Bestand-D-DAI-ROM-2007.5627.JPG




cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0067.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0175.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0277.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0376.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0594.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0694.JPG'
cannot identify image file '/home/shohl/Bilder/idai_cloud_mount/dai-rom-fotothek-2003/Bestand-D-DAI-ROM-2003.0794.JPG'
(5323, 2048)
5323
(2661, 2048)
2661


In [3]:
# We need to scale the features to values between 0 and 1, because the sigmoid layer (last autoencoder layer) 
# produces values in that range. Without scaling the original values down, the autoencoder won't be able to 
# reproduce input values > 1.

print(np.max(features_train))
print(np.max(features_test))

if(np.max(features_train) > np.max(features_test)):
    features_train_scaled = features_train / np.max(features_train)
    features_test_scaled = features_test / np.max(features_train)
else:
    features_train_scaled = features_train / np.max(features_test)
    features_test_scaled = features_test / np.max(features_test)

print(np.max(features_train_scaled))
print(np.max(features_test_scaled))

29.489676
21.073988
1.0
0.71462256


In [4]:
from sklearn.neighbors import NearestNeighbors
neigh = NearestNeighbors(n_neighbors=10)

result = neigh.fit(features_train_scaled)

neighbours = result.kneighbors()

In [5]:
print(len(neighbours))
print(len(neighbours[0]))
print(neighbours[0])
print(len(neighbours[1]))
print(neighbours[1])

2
5323
[[0.8869949  0.95451961 0.96011991 ... 0.9791909  0.98546634 0.9955557 ]
 [1.26883223 1.30483152 1.3814718  ... 1.44745823 1.44969341 1.45190205]
 [1.1523444  1.19131711 1.20262143 ... 1.2408876  1.2468743  1.2603429 ]
 ...
 [1.03671313 1.03765333 1.06060254 ... 1.08386614 1.08407201 1.08443034]
 [1.06930514 1.07236736 1.08549986 ... 1.11641227 1.11959069 1.12072677]
 [0.74386297 0.77322731 0.79062213 ... 0.83160336 0.83692799 0.84112598]]
5323
[[1821 2854 2761 ... 3762 2723 2719]
 [ 721 2822 3091 ...  208 4134 1022]
 [ 184  204 1096 ... 1089 2822 3232]
 ...
 [4274 2145 4962 ... 1395 2131 3367]
 [5193 1383 4025 ... 1060 4137 1394]
 [5192 5314  160 ... 4712 5181 1056]]


In [7]:
# Initialize and run autoencoder

from keras.models import Sequential, Model
from keras.layers import Dense
from keras.optimizers import Adam, Adadelta
from keras import regularizers
import keras.backend as K

def euclidean_distance_loss(y_true, y_pred):
    """
    Euclidean distance loss
    https://en.wikipedia.org/wiki/Euclidean_distance
    :param y_true: TensorFlow/Theano tensor
    :param y_pred: TensorFlow/Theano tensor of the same shape as y_true
    :return: float
    """
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=-1))

# Stop if there has been no improvement for 100 epochs
es = keras.callbacks.EarlyStopping(verbose=1, patience=100)
# Save best model while training
mc = keras.callbacks.ModelCheckpoint('best_model.h5')

callbacks_list = [es, mc]

m = Sequential()

m.add(Dense(512, activation='elu', input_shape=(features_train_scaled.shape[1],)))
m.add(Dense(32, activation='linear', name="bottleneck"))
m.add(Dense(512, activation='elu'))
m.add(Dense(features_train.shape[1], activation='sigmoid'))

m.compile(loss=euclidean_distance_loss, optimizer=Adadelta())

history = m.fit(features_train_scaled, features_train_scaled, batch_size=128, epochs=100000, verbose=1,
                validation_data=(features_test_scaled, features_test_scaled), callbacks=callbacks_list)


Train on 5323 samples, validate on 2661 samples
Epoch 1/100000
Epoch 2/100000
Epoch 3/100000
Epoch 4/100000
Epoch 5/100000
Epoch 6/100000
Epoch 7/100000
Epoch 8/100000
Epoch 9/100000
Epoch 10/100000
Epoch 11/100000
Epoch 12/100000
Epoch 13/100000
Epoch 14/100000
Epoch 15/100000
Epoch 16/100000
Epoch 17/100000
Epoch 18/100000
Epoch 19/100000
Epoch 20/100000
Epoch 21/100000
Epoch 22/100000
Epoch 23/100000
Epoch 24/100000
Epoch 25/100000
Epoch 26/100000
Epoch 27/100000
Epoch 28/100000
Epoch 29/100000
Epoch 30/100000
Epoch 31/100000
Epoch 32/100000
Epoch 33/100000
Epoch 34/100000
Epoch 35/100000
Epoch 36/100000
Epoch 37/100000
Epoch 38/100000
Epoch 39/100000
Epoch 40/100000
Epoch 41/100000
Epoch 42/100000
Epoch 43/100000
Epoch 44/100000
Epoch 45/100000
Epoch 46/100000
Epoch 47/100000
Epoch 48/100000
Epoch 49/100000
Epoch 50/100000
Epoch 51/100000
Epoch 52/100000
Epoch 53/100000
Epoch 54/100000
Epoch 55/100000
Epoch 56/100000
Epoch 57/100000
Epoch 58/100000
Epoch 59/100000
Epoch 60/100000
E

Epoch 76/100000
Epoch 77/100000
Epoch 78/100000
Epoch 79/100000
Epoch 80/100000
Epoch 81/100000
Epoch 82/100000
Epoch 83/100000
Epoch 84/100000
Epoch 85/100000
Epoch 86/100000
Epoch 87/100000
Epoch 88/100000
Epoch 89/100000
Epoch 90/100000
Epoch 91/100000
Epoch 92/100000
Epoch 93/100000
Epoch 94/100000
Epoch 95/100000
Epoch 96/100000
Epoch 97/100000
Epoch 98/100000
Epoch 99/100000
Epoch 100/100000
Epoch 101/100000
Epoch 102/100000
Epoch 103/100000
Epoch 104/100000
Epoch 105/100000
Epoch 106/100000
Epoch 107/100000
Epoch 108/100000
Epoch 109/100000
Epoch 110/100000
Epoch 111/100000
Epoch 112/100000
Epoch 113/100000
Epoch 114/100000
Epoch 115/100000
Epoch 116/100000
Epoch 117/100000
Epoch 118/100000
Epoch 119/100000
Epoch 120/100000
Epoch 121/100000
Epoch 122/100000
Epoch 123/100000
Epoch 124/100000
Epoch 125/100000
Epoch 126/100000
Epoch 127/100000
Epoch 128/100000
Epoch 129/100000
Epoch 130/100000
Epoch 131/100000
Epoch 132/100000
Epoch 133/100000
Epoch 134/100000
Epoch 135/100000
Epoc

Epoch 151/100000
Epoch 152/100000
Epoch 153/100000
Epoch 154/100000
Epoch 155/100000
Epoch 156/100000
Epoch 157/100000
Epoch 158/100000
Epoch 159/100000
Epoch 160/100000
Epoch 161/100000
Epoch 162/100000
Epoch 163/100000
Epoch 164/100000
Epoch 165/100000
Epoch 166/100000
Epoch 167/100000
Epoch 168/100000
Epoch 169/100000
Epoch 170/100000
Epoch 171/100000
Epoch 172/100000
Epoch 173/100000
Epoch 174/100000
Epoch 175/100000
Epoch 176/100000
Epoch 177/100000
Epoch 178/100000
Epoch 179/100000
Epoch 180/100000
Epoch 181/100000
Epoch 182/100000
Epoch 183/100000
Epoch 184/100000
Epoch 185/100000
Epoch 186/100000
Epoch 187/100000
Epoch 188/100000
Epoch 189/100000
Epoch 190/100000
Epoch 191/100000
Epoch 192/100000
Epoch 193/100000
Epoch 194/100000
Epoch 195/100000
Epoch 196/100000
Epoch 197/100000
Epoch 198/100000
Epoch 199/100000
Epoch 200/100000
Epoch 201/100000
Epoch 202/100000
Epoch 203/100000
Epoch 204/100000
Epoch 205/100000
Epoch 206/100000
Epoch 207/100000
Epoch 208/100000
Epoch 209/1000

Epoch 225/100000
Epoch 226/100000
Epoch 227/100000
Epoch 228/100000
Epoch 229/100000
Epoch 230/100000
Epoch 231/100000
Epoch 232/100000
Epoch 233/100000
Epoch 234/100000
Epoch 235/100000
Epoch 236/100000
Epoch 237/100000
Epoch 238/100000
Epoch 239/100000
Epoch 240/100000
Epoch 241/100000
Epoch 242/100000
Epoch 243/100000
Epoch 244/100000
Epoch 245/100000
Epoch 246/100000
Epoch 247/100000
Epoch 248/100000
Epoch 249/100000
Epoch 250/100000
Epoch 251/100000
Epoch 252/100000
Epoch 253/100000
Epoch 254/100000
Epoch 255/100000
Epoch 256/100000
Epoch 257/100000
Epoch 258/100000
Epoch 259/100000
Epoch 260/100000
Epoch 261/100000
Epoch 262/100000
Epoch 263/100000
Epoch 264/100000
Epoch 265/100000
Epoch 266/100000
Epoch 267/100000
Epoch 268/100000
Epoch 269/100000
Epoch 270/100000
Epoch 271/100000
Epoch 272/100000
Epoch 273/100000
Epoch 274/100000
Epoch 275/100000
Epoch 276/100000
Epoch 277/100000
Epoch 278/100000
Epoch 279/100000
Epoch 280/100000
Epoch 281/100000
Epoch 282/100000
Epoch 283/1000

Epoch 299/100000
Epoch 300/100000
Epoch 301/100000
Epoch 302/100000
Epoch 303/100000
Epoch 304/100000
Epoch 305/100000
Epoch 306/100000
Epoch 307/100000
Epoch 308/100000
Epoch 309/100000
Epoch 310/100000
Epoch 311/100000
Epoch 312/100000
Epoch 313/100000
Epoch 314/100000
Epoch 315/100000
Epoch 316/100000
Epoch 317/100000
Epoch 318/100000
Epoch 319/100000
Epoch 320/100000
Epoch 321/100000
Epoch 322/100000
Epoch 323/100000
Epoch 324/100000
Epoch 325/100000
Epoch 326/100000
Epoch 327/100000
Epoch 328/100000
Epoch 329/100000
Epoch 330/100000
Epoch 331/100000
Epoch 332/100000
Epoch 333/100000
Epoch 334/100000
Epoch 335/100000
Epoch 336/100000
Epoch 337/100000
Epoch 338/100000
Epoch 339/100000
Epoch 340/100000
Epoch 341/100000
Epoch 342/100000
Epoch 343/100000
Epoch 344/100000
Epoch 345/100000
Epoch 346/100000
Epoch 347/100000
Epoch 348/100000
Epoch 349/100000
Epoch 350/100000
Epoch 351/100000
Epoch 352/100000
Epoch 353/100000
Epoch 354/100000
Epoch 355/100000
Epoch 356/100000
Epoch 357/1000

Epoch 373/100000
Epoch 374/100000
Epoch 375/100000
Epoch 376/100000
Epoch 377/100000
Epoch 378/100000
Epoch 379/100000
Epoch 380/100000
Epoch 381/100000
Epoch 382/100000
Epoch 383/100000
Epoch 384/100000
Epoch 385/100000
Epoch 386/100000
Epoch 387/100000
Epoch 388/100000
Epoch 389/100000
Epoch 390/100000
Epoch 391/100000
Epoch 392/100000
Epoch 393/100000
Epoch 394/100000
Epoch 395/100000
Epoch 396/100000
Epoch 397/100000
Epoch 398/100000
Epoch 399/100000
Epoch 400/100000
Epoch 401/100000
Epoch 402/100000
Epoch 403/100000
Epoch 404/100000
Epoch 405/100000
Epoch 406/100000
Epoch 407/100000
Epoch 408/100000
Epoch 409/100000
Epoch 410/100000
Epoch 411/100000
Epoch 412/100000
Epoch 413/100000
Epoch 414/100000
Epoch 415/100000
Epoch 416/100000
Epoch 417/100000
Epoch 418/100000
Epoch 419/100000
Epoch 420/100000
Epoch 421/100000
Epoch 422/100000
Epoch 423/100000
Epoch 424/100000
Epoch 425/100000
Epoch 426/100000
Epoch 427/100000
Epoch 428/100000
Epoch 429/100000
Epoch 430/100000
Epoch 431/1000

Epoch 447/100000
Epoch 448/100000
Epoch 449/100000
Epoch 450/100000
Epoch 451/100000
Epoch 452/100000
Epoch 453/100000
Epoch 454/100000
Epoch 455/100000
Epoch 456/100000
Epoch 457/100000
Epoch 458/100000
Epoch 459/100000
Epoch 460/100000
Epoch 461/100000
Epoch 462/100000
Epoch 463/100000
Epoch 464/100000
Epoch 465/100000
Epoch 466/100000
Epoch 467/100000
Epoch 468/100000
Epoch 469/100000
Epoch 470/100000
Epoch 471/100000
Epoch 472/100000
Epoch 473/100000
Epoch 474/100000
Epoch 475/100000
Epoch 476/100000
Epoch 477/100000
Epoch 478/100000
Epoch 479/100000
Epoch 480/100000
Epoch 481/100000
Epoch 482/100000
Epoch 483/100000
Epoch 484/100000
Epoch 485/100000
Epoch 486/100000
Epoch 487/100000
Epoch 488/100000
Epoch 489/100000
Epoch 490/100000
Epoch 491/100000
Epoch 492/100000
Epoch 493/100000
Epoch 494/100000
Epoch 495/100000
Epoch 496/100000
Epoch 497/100000
Epoch 498/100000
Epoch 499/100000
Epoch 500/100000
Epoch 501/100000
Epoch 502/100000
Epoch 503/100000
Epoch 504/100000
Epoch 505/1000

Epoch 521/100000
Epoch 522/100000
Epoch 523/100000
Epoch 524/100000
Epoch 525/100000
Epoch 526/100000
Epoch 527/100000
Epoch 528/100000
Epoch 529/100000
Epoch 530/100000
Epoch 531/100000
Epoch 532/100000
Epoch 533/100000
Epoch 534/100000
Epoch 535/100000
Epoch 536/100000
Epoch 537/100000
Epoch 538/100000
Epoch 539/100000
Epoch 540/100000
Epoch 541/100000
Epoch 542/100000
Epoch 543/100000
Epoch 544/100000
Epoch 545/100000
Epoch 546/100000
Epoch 547/100000
Epoch 548/100000
Epoch 549/100000
Epoch 550/100000
Epoch 551/100000
Epoch 552/100000
Epoch 553/100000
Epoch 554/100000
Epoch 555/100000
Epoch 556/100000
Epoch 557/100000
Epoch 558/100000
Epoch 559/100000
Epoch 560/100000
Epoch 561/100000
Epoch 562/100000
Epoch 563/100000
Epoch 564/100000
Epoch 565/100000
Epoch 566/100000
Epoch 567/100000
Epoch 568/100000
Epoch 569/100000
Epoch 570/100000
Epoch 571/100000
Epoch 572/100000
Epoch 573/100000
Epoch 574/100000
Epoch 575/100000
Epoch 576/100000
Epoch 577/100000
Epoch 578/100000
Epoch 579/1000

Epoch 595/100000
Epoch 596/100000
Epoch 597/100000
Epoch 598/100000
Epoch 599/100000
Epoch 600/100000
Epoch 601/100000
Epoch 602/100000
Epoch 603/100000
Epoch 604/100000
Epoch 605/100000
Epoch 606/100000
Epoch 607/100000
Epoch 608/100000
Epoch 609/100000
Epoch 610/100000
Epoch 611/100000
Epoch 612/100000
Epoch 613/100000
Epoch 614/100000
Epoch 615/100000
Epoch 616/100000
Epoch 617/100000
Epoch 618/100000
Epoch 619/100000
Epoch 620/100000
Epoch 621/100000
Epoch 622/100000
Epoch 623/100000
Epoch 624/100000
Epoch 625/100000
Epoch 626/100000
Epoch 627/100000
Epoch 628/100000
Epoch 629/100000
Epoch 630/100000
Epoch 631/100000
Epoch 632/100000
Epoch 633/100000
Epoch 634/100000
Epoch 635/100000
Epoch 636/100000
Epoch 637/100000
Epoch 638/100000
Epoch 639/100000
Epoch 640/100000
Epoch 641/100000
Epoch 642/100000
Epoch 643/100000
Epoch 644/100000
Epoch 00644: early stopping


In [9]:
from keras.models import load_model
best_model = load_model('best_model.h5', custom_objects={'euclidean_distance_loss': euclidean_distance_loss})
encoder = Model(best_model.input, best_model.get_layer('bottleneck').output)

enc = encoder.predict(features_train_scaled) # returns the encoded values (32 floats instead of 2048)


In [10]:
result_enc = neigh.fit(enc)
neighbours_enc = result_enc.kneighbors()

In [11]:
enc_r = best_model.predict(features_train_scaled)      # reconstruction, 2048 -> 32 -> 2048

In [12]:
result_enc_r = neigh.fit(enc_r)
neighbours_enc_r = result_enc.kneighbors()

In [13]:
np.min(features_train_scaled)

0.0

In [14]:
np.min(features_test_scaled)

0.0

In [15]:
np.max(features_train_scaled)

1.0

In [16]:
np.max(features_test_scaled)

0.71462256

In [17]:
print(neighbours[1][0])
print(neighbours_enc_r[1][0])
print(neighbours_enc[1][0])

[1821 2854 2761 3123 3441 1823 3599 3762 2723 2719]
[1821 2854 4093 4518 3115 4012 3762 1499 5149 3123]
[4093 4518 1821 3762 2723 2642 2854 4012 3225 4018]


In [18]:
print(neighbours[1][1])
print(neighbours_enc_r[1][1])
print(neighbours_enc[1][1])

[ 721 2822 3091 4785 4879 2877 1107  208 4134 1022]
[ 721 2822 3091  150 4785 1365 4879  208 1088 1107]
[ 721 2822  150  187 1107  173 1022  185 1092  177]


In [19]:
print(neighbours[1][2])
print(neighbours_enc_r[1][2])
print(neighbours_enc[1][2])

[ 184  204 1096  225 1090 1088 1085 1089 2822 3232]
[ 204  184 1090 1088 1107  225 1089 1096 3232 1085]
[ 204 1090 1088  184  225 1107 1089 1092  173 2495]


In [20]:
enc.shape

(5323, 32)

In [21]:
features_train_scaled.shape

(5323, 2048)

In [22]:
encoder.save('encoder.h5')