# Calculate Features

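This notebook uses a pretrained CNN (VGG16) to extract a feature vector for each image in `../data/processed/images`. 
The features are appended, one row per image, to a timestamped CSV file in `../data/processed`.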

In [1]:
import tensorflow as tf
import os
import csv
import pandas as pd
import numpy as np
from datetime import datetime as dt


In [2]:
import utils

In [3]:
def prep_metadata_file(csv_fname_in, csv_fname_out = 'graphik_portal_{}.csv'):
    """
    Clean the scraped metadata csv, attach local image paths and save the result.

    Reads ``../data/raw/scraped/<csv_fname_in>`` (no header row, first column
    used as the index), drops rows containing NaN, drops the ``request_num``
    column and any stray header rows, and indexes the records by integer
    ``object_id``. The records are then left-joined to the image files found
    under ``../data/processed/images`` (the file name up to the first ``.`` is
    taken as the object id); records without an image are dropped. The result
    is written to ``../data/interim``.

    Parameters
    ----------
    csv_fname_in : str
        File name of the scraped metadata csv inside ``../data/raw/scraped``.
    csv_fname_out : str, optional
        Output file name. A ``{}`` placeholder is replaced by a
        ``YYYYmmddHHMM`` time stamp.

    Returns
    -------
    pandas.DataFrame
        The cleaned metadata with ``object_id`` as a regular column and an
        added ``img_path`` column.
    """

    src_fldr = os.path.join('..', 'data','raw','scraped')
    output_fldr = os.path.join('..', 'data','interim')
    img_fldr_path = os.path.join('..', 'data','processed','images')

    # load picture metadata
    col_names = ['title','img_url','detail_url','detail_description','object_id', 'request_num']
    df = pd.read_csv(os.path.join(src_fldr, csv_fname_in), index_col=0, header=None, na_values=['NaN'])
    df.columns = col_names

    # drop NA's
    orig_len = df.shape[0]
    df = df.dropna()
    num_dropped = orig_len - df.shape[0]
    print('dropped {:,} records due to NaN'.format(num_dropped))

    # drop request_num col
    df = df.drop(columns=['request_num'])

    # remove rows where a header was incorrectly inserted into the data;
    # copy so the column assignment below works on an independent frame
    df = df.loc[df['object_id']!='object_id'].copy()

    # change object id to int and set as index
    df['object_id'] = df['object_id'].astype(np.int32)
    df = df.set_index('object_id').sort_index()

    # list the images that exist in the processed folder
    img_fpath_lst = utils.get_list_of_files_in_dir(img_fldr_path, file_types = ['jpg', 'jpeg','png'], keep_fldr_path=True)

    # remove any duplicate files
    unique_img_fpaths = pd.Series(img_fpath_lst, name='img_path', dtype=object).drop_duplicates()

    # derive the object id from each file name; ids and paths are collected
    # together so a file with a non-numeric name is skipped in BOTH lists
    # (otherwise the index below would not line up with the paths)
    img_object_id = []
    img_fpath_valid = []

    for f in unique_img_fpaths:
        f_id = os.path.basename(f).split('.')[0]
        try:
            f_id = int(f_id)
        except ValueError:
            # report the offending file stem and leave the file out
            print(f_id)
            continue
        img_object_id.append(f_id)
        img_fpath_valid.append(f)

    img_fpath_ser = pd.Series(
        img_fpath_valid,
        index=np.asarray(img_object_id, dtype=np.int64, order='C'),
        name='img_path',
        dtype=object,
    )

    # join series with image paths to df and keep only rows that have an image
    df = df.join(img_fpath_ser, how='left', sort=False)
    df = df.dropna(subset=['img_path',])

    # write interim result to file, honouring the requested output name
    time_stamp = dt.now().strftime('%Y%m%d%H%M')
    os.makedirs(output_fldr, exist_ok=True)
    csv_fpath_out = os.path.join(output_fldr, csv_fname_out.format(time_stamp))

    df.to_csv(csv_fpath_out)
    print("wrote out csv {} with {:,} records".format(csv_fpath_out, df.shape[0]))

    return df.reset_index()

In [43]:
def initialise_model(print_summary=True):
    """
    Initialise the model to be used for feature extraction.

    The model is an ImageNet-pretrained VGG16 convolutional base (no
    classifier head, 224x224x3 input) followed by a 7x7 max pooling layer, a
    flatten layer and a layer normalisation.

    Parameters
    ----------
    print_summary : bool, optional
        When truthy, print the Keras model summary.

    Returns
    -------
    tf.keras.Sequential
        The assembled feature-extraction model.
    """

    model_backbone = tf.keras.applications.VGG16(include_top=False,weights='imagenet', input_shape=(224,224,3))
    # NOTE: an earlier version called `model_backbone._layers.pop()` here to
    # "drop the last max pooling layer". That only edits a private bookkeeping
    # list; it does not rewire the functional graph, so the backbone output was
    # unchanged. The call is removed because the private attribute is not part
    # of the public Keras API.

    # the 7x7 pool window covers the backbone's whole 7x7 spatial output,
    # leaving one value per channel
    pooling_lyr = tf.keras.layers.MaxPool2D(pool_size=(7,7))
    flatten_lyr = tf.keras.layers.Flatten()
    norm_lyr = tf.keras.layers.LayerNormalization()
    model = tf.keras.Sequential([model_backbone, pooling_lyr, flatten_lyr, norm_lyr])

    if print_summary:
        # summary() prints by itself and returns None, so it is not wrapped in print()
        model.summary()

    return model

In [44]:
# Scratch cell: build an untrained (weights=None) VGG16 that keeps its classifier head (include_top=True).
model_backbone = tf.keras.applications.VGG16(include_top=True, weights=None, input_shape=(224,224,3))


In [37]:
# NOTE(review): `model` is only assigned further down (`model = initialise_model()`), so this
# cell fails on a fresh Restart & Run All. `bp5` is not used by any other cell shown here.
# TODO: move below the model initialisation or delete.
bp5 = model.get_layer(index=-3)

In [45]:
# print the layer-by-layer summary (output shapes and parameter counts) of `model_backbone`
model_backbone.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         [(None, 224, 224, 3)]     0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0     

In [41]:
# number of values in a flattened 14 x 14 x 512 feature map
# (presumably a VGG16 conv output before the final pooling — TODO confirm)
14*14*512

100352

In [5]:
def make_features_csv():
    """
    Create an empty, timestamped csv file to hold the extracted features.

    The file is named ``features_<YYYYmmddHHMM>.csv`` and is placed in
    ``../data/processed`` (relative to the working directory). The folder is
    created if it does not exist yet. A file with the same name (created
    within the same minute) is truncated.

    Returns
    -------
    str
        Path to the newly created file.
    """

    fldr_path = os.path.join('..', 'data','processed')
    # make sure the target folder exists so open() below cannot fail on it
    os.makedirs(fldr_path, exist_ok=True)

    time_stamp = dt.now().strftime('%Y%m%d%H%M')
    fname = 'features_{}.csv'.format(time_stamp)
    fpath = os.path.join(fldr_path, fname)

    # opening in 'w' mode is all that is needed to create (or empty) the file
    with open(fpath, 'w', newline=''):
        pass

    return fpath


In [6]:
def append_tf_features_to_csv(features, labels, fpath):
    """
    Append one batch of labelled feature rows to an existing csv file.

    ``labels`` is converted with its ``.numpy()`` method and placed in the
    first column; the columns of ``features`` follow. The rows are written
    comma separated with numpy's default number format.
    """

    # turn the label vector into a single column next to the features
    label_col = labels.numpy()[:, np.newaxis]
    rows = np.concatenate([label_col, features], axis=1)

    # 'a' mode: add the rows after whatever the file already holds
    with open(fpath, 'a', newline='') as out_file:
        np.savetxt(out_file, rows, delimiter=',')

    return None
    

# Prepare Metadata File

In [28]:
# file name of the scraped metadata; prep_metadata_file resolves it inside ../data/raw/scraped
csv_fname_in = 'graphik_portal_results.csv'

# clean the metadata, keep only records that have an image on disk, and write the interim csv
df = prep_metadata_file(csv_fname_in)
df.head()

dropped 5 records due to NaN
found 9994 existing images
wrote out csv ../data/interim/graphik_portal_202010121646.csv with 9,994 records


Unnamed: 0,object_id,title,img_url,detail_url,detail_description,img_path
0,3,Marcus Curtius stürzt sich in die Erdspalte,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,Monogrammist IB [Nagler III 1950] (Erwähnt um ...,../data/processed/images/0/3.png
1,18,Die Philister bringen die Bundeslade in den Te...,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Battista Franco (Um 1510 - 1561), Um 1525 - 1561",../data/processed/images/0/18.png
2,19,Der grosse Saal im Schloss in Prag [Linke Bild...,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,Egidius Sadeler (der Jüngere) (Um 1570 - 1629)...,../data/processed/images/0/19.png
3,33,Die schöne Försterin,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Henry Wyatt (1794 - 1840), nach, 1835, Francis...",../data/processed/images/0/33.png
4,52,Stigmatisation des heiligen Franziskus,https://www.e-gs.ethz.ch/eMP/eMuseumPlus?servi...,https://www.graphikportal.org/document/gpo0021...,"Agostino Carracci (1557 - 1602), Ca. 1583",../data/processed/images/0/52.png


# Create TensorFlow Dataset

In [29]:
# images per batch; also used further down to compute the number of extraction steps
batch_size=32

# Build the dataset from `df` with the project helper in `utils`.
# 'img_path' / 'object_id' are presumably the image-path and label columns (the extraction
# loop unpacks every element as (images, labels)) — TODO confirm against utils.
# NOTE(review): the meaning of normalize / rgb_values / conv_color is defined in utils and
# cannot be verified from this notebook.
ds = utils.make_tfdataset_from_df(df,
                           'img_path', 
                           'object_id',
                           batch_size=batch_size,
                           for_training=False,
                           normalize=False,
                           augment=False,
                           augment_func=None,
                           rgb_values=([0,0,0],[1,1,1]),
                           conv_color='rgb')

# Load Model

In [31]:
# NOTE(review): `m2` is never assigned in the cells shown here (the recorded output names it
# "resnet50"), so this cell fails on a fresh Restart & Run All — TODO define `m2` or delete.
m2.summary()

Model: "resnet50"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 224, 224, 3) 0                                            
__________________________________________________________________________________________________
conv1_pad (ZeroPadding2D)       (None, 230, 230, 3)  0           input_5[0][0]                    
__________________________________________________________________________________________________
conv1_conv (Conv2D)             (None, 112, 112, 64) 9472        conv1_pad[0][0]                  
__________________________________________________________________________________________________
conv1_bn (BatchNormalization)   (None, 112, 112, 64) 256         conv1_conv[0][0]                 
___________________________________________________________________________________________

In [33]:
# Sanity check: max pooling with a 1x1 window and stride 1 on a 3x3 single-channel input.
# Each window holds exactly one value, so the input values come back unchanged.
x = tf.constant([[1., 2., 3.],
                 [4., 5., 6.],
                 [7., 8., 9.]])
# reshape to 4-D (1, 3, 3, 1) before passing it to the 2-D pooling layer
x = tf.reshape(x, [1, 3, 3, 1])
max_pool_2d = tf.keras.layers.MaxPooling2D(pool_size=(1, 1),
   strides=(1, 1), padding='valid')
max_pool_2d(x)






<tf.Tensor: shape=(1, 3, 3, 1), dtype=float32, numpy=
array([[[[1.],
         [2.],
         [3.]],

        [[4.],
         [5.],
         [6.]],

        [[7.],
         [8.],
         [9.]]]], dtype=float32)>

In [13]:
# take the first batch from the dataset
images, labels =  next(iter(ds))
# extract features for that batch
# NOTE(review): `m2` is never assigned in the cells shown here — TODO define it or delete this cell.
features = m2.predict(images)

In [15]:
# shape of the feature array predicted for the first batch
features.shape

(32, 7, 7, 2048)

In [18]:
# number of values per image once the first image's feature map is flattened to 1-D
features[0,:].flatten().shape

(100352,)

In [46]:
# build the feature-extraction model used by the extraction loop (prints its summary by default)
model = initialise_model()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
vgg16 (Model)                (None, 7, 7, 512)         14714688  
_________________________________________________________________
max_pooling2d_3 (MaxPooling2 (None, 1, 1, 512)         0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 512)               0         
_________________________________________________________________
layer_normalization_1 (Layer (None, 512)               1024      
Total params: 14,715,712
Trainable params: 14,715,712
Non-trainable params: 0
_________________________________________________________________
None


# Extract Features

In [47]:
# create the empty, timestamped csv that every batch is appended to
fpath = make_features_csv()

num_images = df.shape[0]
# round up so that a final, smaller batch still counts as a step
total_steps = int(np.ceil(num_images / batch_size))
print('starting feature extraction for:')
print('    {:,} images in {:,} steps'.format(num_images, total_steps))
print('    writing features to {}'.format(fpath))

for i, (images, labels) in enumerate(ds):
    # extract features: call the model directly rather than model.predict().
    # predict() sets up its own batching on every call and is not meant to be
    # used once per batch inside a loop over the data.
    features = model(images, training=False).numpy()
    append_tf_features_to_csv(features, labels, fpath)
    # update progress
    utils.print_dyn_progress_bar(total_steps,i)

starting feature extraction for:
    9,994 images in 313 steps
    writing features
[-------------------------------------------------->] 100.00%