In [5]:
import os
import re
import pandas as pd
from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input

model = VGG16(weights='imagenet', include_top=False)
model.summary()

# to save space in the CSV, only track non-zero values in the features;
# full lists can be reconstructed later
def nonzeroes(arr, filtered):
    for idx, item in enumerate(Arr):
        if type(item) is np.ndarray:
            filtered[idx] = nonzeroes(item, dict())
        else:
            if item != 0:
                filtered[idx] = item
    return filtered

## create a regex pattern
p = re.compile('(\d{8})_img_(\d+).jpg')

# create an array to hold image features which will be turned into a dataframe 
# (faster than starting with a df)
data_arr = []
failcount = 0

# actual image jpgs are not kept within project directory as they would take up too much space
for root, dirs, files in os.walk('/Users/Desktop/imgs'):
    for name in files:
        match = p.match(name)
        if match:
            try:
                img_dict = {
                    'MLSNUM': match.group(1),
                    'IMGNUM': match.group(2)
                }
                path = os.path.join(root, name)
                img = image.load_img(path, target_size=(224,224))
                img_data = image.img_to_array(img)
                img_data = np.expand_dims(img_data, axis=0)
                img_data = preprocess_input(img_data)
                
                features = model.predict(img_data)
                img_dict['FEATURES'] = nonzeroes(features, dict())
                
                data_arr.append(img_dict)
            except OSError:
                failcount += 1
                
print('There were ' + str(failcount) + ' images that failed to load.')

data_df = pd.DataFrame(data_arr)
print(data_df.shape)



data_df['FEATURES'].iloc(0)

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

KeyError: 'FEATURES'